From f5cc90363e25278472c5b929e2e569c3cd353b9a Mon Sep 17 00:00:00 2001 From: wenytang-ms Date: Wed, 20 May 2026 11:39:50 +0800 Subject: [PATCH 1/3] fix(test-plans): mitigate scheduled e2e-autotest flakiness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Triage of the last 8 scheduled e2e-autotest runs identified three failure categories: a real plan bug, LLM screenshot-based false downgrades, and real timing flakes. This change addresses all three. Category A — real plan bug * java-pack-help-center-webview was missing vscjava.vscode-java-pack from setup.extensions. On scheduled runs (no PR VSIX) java.welcome was unregistered and the open-help-center step silently timed out. This was the #1 failure across the last 8 nightly runs (7/8). Now installs the pack from the marketplace on schedule runs while still letting --vsix override on PR runs. Category B — LLM downgrade noise on ls-ready * Add skipLlmVerify: true (introduced in @vscjava/vscode-autotest 0.7.5) to every ls-ready step that has no structured verify* field. The waitForLanguageServer action is itself the authoritative deterministic check; the LLM was downgrading these whenever the status bar still showed background indexing ("Java: Searching... 0%"), even though the LS was fully functional. Affected: java-dependency-viewer, java-extension-pack, java-fresh-import, java-maven-resolve-type, java-maven, java-new-file-snippet, java-single-file, java-webview-migration. Category C — real timing flakes * java-test-runner: bump wait-test-discovery from 45s to 90s (the vscode-java-test discovery scan can take longer than 45s on a cold cache) and add retries: 1 to run-all-tests so a discovery-still-warming first invocation can retry. * java-maven-resolve-type: add retries: 1 to save-after-resolve so a slow Maven re-import on a cold cache (where the LS hasn't yet republished zero-errors at the time of save) can retry instead of failing the plan. 
Plans whose flaky steps already carry a structured verify* field (e.g. verify-completion with verifyCompletion: { notEmpty: true }, save-after-organize with verifyFile, verify-help-center-content with verifyWebview) no longer need plan changes because the framework auto-skip in @vscjava/vscode-autotest 0.7.5 already short-circuits the LLM re-check whenever any structured verifier is present. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test-plans/java-dependency-viewer.yaml | 5 +++++ test-plans/java-extension-pack.yaml | 3 +++ test-plans/java-fresh-import.yaml | 3 +++ test-plans/java-maven-resolve-type.yaml | 6 ++++++ test-plans/java-maven.yaml | 2 ++ test-plans/java-new-file-snippet.yaml | 2 ++ test-plans/java-pack-help-center-webview.yaml | 7 +++++++ test-plans/java-single-file.yaml | 2 ++ test-plans/java-test-runner.yaml | 7 ++++++- test-plans/java-webview-migration.yaml | 4 ++++ 10 files changed, 40 insertions(+), 1 deletion(-) diff --git a/test-plans/java-dependency-viewer.yaml b/test-plans/java-dependency-viewer.yaml index 89bbb5c0..258530ce 100644 --- a/test-plans/java-dependency-viewer.yaml +++ b/test-plans/java-dependency-viewer.yaml @@ -28,6 +28,11 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Java workspace has loaded; Explorer shows the project tree and Problems panel is settled" + # waitForLanguageServer is the authoritative deterministic check — the + # status bar can still flicker "Java: Searching... 0%" for background + # indexing right after the LS reports ready, which has historically + # caused LLM screenshot downgrades. Skip LLM here. 
+ skipLlmVerify: true timeout: 120 # ── Open dependency view ───────────────────────────────── diff --git a/test-plans/java-extension-pack.yaml b/test-plans/java-extension-pack.yaml index 7e26ec98..05e7a9d0 100644 --- a/test-plans/java-extension-pack.yaml +++ b/test-plans/java-extension-pack.yaml @@ -30,6 +30,9 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Java workspace has loaded; Explorer shows the project tree and Problems panel is settled" + # waitForLanguageServer is the authoritative deterministic check — + # status-bar background indexing can cause spurious LLM downgrades. + skipLlmVerify: true timeout: 120 # ── Trigger Classpath configuration command ────────────── diff --git a/test-plans/java-fresh-import.yaml b/test-plans/java-fresh-import.yaml index e077e526..c4952942 100644 --- a/test-plans/java-fresh-import.yaml +++ b/test-plans/java-fresh-import.yaml @@ -45,6 +45,9 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "spring-petclinic project has been imported; Java extension is activated and ready for editing" + # waitForLanguageServer is authoritative — skip LLM screenshot re-check + # (status bar background indexing causes false downgrades). + skipLlmVerify: true timeout: 300 # ── Verify completion ──────────────────────────────────── diff --git a/test-plans/java-maven-resolve-type.yaml b/test-plans/java-maven-resolve-type.yaml index a136171e..ca42d3df 100644 --- a/test-plans/java-maven-resolve-type.yaml +++ b/test-plans/java-maven-resolve-type.yaml @@ -49,6 +49,8 @@ steps: action: "waitForLanguageServer" verify: "maven-resolve-type project has been imported; the Java extension is activated and pom.xml is visible in the Explorer" timeout: 180 + # waitForLanguageServer is authoritative — skip LLM screenshot re-check. 
+ skipLlmVerify: true # ── Open Java file ────────────────────────────────────── - id: "open-app" @@ -161,6 +163,10 @@ steps: errors: 0 waitBefore: 20 timeout: 90 + # Maven re-import on a cold cache can take significantly longer than the + # waitBefore window; a single retry (with the LS likely already settled + # by then) recovers without inflating the happy-path wait further. + retries: 1 # After save, the language server publishes diagnostics (status bar updates # to 0 errors, verified deterministically above). However, on Linux runners diff --git a/test-plans/java-maven.yaml b/test-plans/java-maven.yaml index a87e17d3..0834e658 100644 --- a/test-plans/java-maven.yaml +++ b/test-plans/java-maven.yaml @@ -30,6 +30,8 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Maven workspace has loaded; the Java extension is initialized and pom.xml is visible in the Explorer (the Problems panel may briefly show diagnostics that are still being recomputed after import)" + # waitForLanguageServer is authoritative — skip LLM screenshot re-check. + skipLlmVerify: true timeout: 120 # ── Step 2: Open Java file and verify editing experience ───────────────── diff --git a/test-plans/java-new-file-snippet.yaml b/test-plans/java-new-file-snippet.yaml index 8b7ea695..3731340d 100644 --- a/test-plans/java-new-file-snippet.yaml +++ b/test-plans/java-new-file-snippet.yaml @@ -28,6 +28,8 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Java workspace has loaded for the simple-app project; no error notifications visible" + # waitForLanguageServer is authoritative — skip LLM screenshot re-check. 
+ skipLlmVerify: true timeout: 120 # ── Step 9: Create new Java file ───────────────────────────── diff --git a/test-plans/java-pack-help-center-webview.yaml b/test-plans/java-pack-help-center-webview.yaml index 7023d3f6..3d749cd0 100644 --- a/test-plans/java-pack-help-center-webview.yaml +++ b/test-plans/java-pack-help-center-webview.yaml @@ -13,6 +13,13 @@ description: | setup: extension: "redhat.java" + # The Help Center webview lives in vscode-java-pack itself. On scheduled + # runs there is no built VSIX, so the pack must be installed from the + # marketplace; otherwise the `java.welcome` command is unregistered and + # the open-help-center step times out silently. On PR runs the + # build-pack job's VSIX takes precedence (--vsix overrides marketplace). + extensions: + - "vscjava.vscode-java-pack" vscodeVersion: "stable" timeout: 60 settings: diff --git a/test-plans/java-single-file.yaml b/test-plans/java-single-file.yaml index d66b634b..8d7e36bc 100644 --- a/test-plans/java-single-file.yaml +++ b/test-plans/java-single-file.yaml @@ -31,6 +31,8 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Java extension has activated for the single-file workspace; no error notifications are visible" + # waitForLanguageServer is authoritative — skip LLM screenshot re-check. + skipLlmVerify: true timeout: 120 # ── Step 2: Open Java file ────────────────────────────── diff --git a/test-plans/java-test-runner.yaml b/test-plans/java-test-runner.yaml index 06021602..cb52a146 100644 --- a/test-plans/java-test-runner.yaml +++ b/test-plans/java-test-runner.yaml @@ -52,7 +52,7 @@ steps: # ready — discovery is asynchronous and Test Explorer is initially empty. # On cold-cache CI runners 20s is sometimes too short; bump to 45s. 
- id: "wait-test-discovery" - action: "wait 45 seconds" + action: "wait 90 seconds" # ── Step 2: Run tests via Java Test Runner palette command ─────── # autotest 0.7.1 ships `openTestExplorer`/`runAllTests` actions wired to @@ -72,6 +72,11 @@ steps: action: "run command Java: Run Tests" verify: "Java: Run Tests command has been invoked from the palette; the Java Test Runner extension has responded (this may show as a Testing view becoming active, a run indicator in the status bar, or an informational notification such as 'No tests found in this file' if discovery is still in progress — all of these indicate the command executed successfully)" waitBefore: 3 + # Test discovery is asynchronous in vscode-java-test; on cold-cache + # CI runners the first Run Tests invocation can land before discovery + # completes ("No tests have been found"). Allow one retry — by the + # second attempt the discovery cache is usually warm. + retries: 1 - id: "wait-test-complete" action: "wait 45 seconds" diff --git a/test-plans/java-webview-migration.yaml b/test-plans/java-webview-migration.yaml index 49aa321a..48933316 100644 --- a/test-plans/java-webview-migration.yaml +++ b/test-plans/java-webview-migration.yaml @@ -55,6 +55,10 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Maven salut workspace has loaded; the Java extension and the pack webview commands are ready" + # waitForLanguageServer is authoritative — skip LLM screenshot re-check + # (status bar may still show "Java: Searching... 0%" for background + # indexing right after the LS reports ready). 
+ skipLlmVerify: true timeout: 180 # ══════════════════════════════════════════════════════════════════════ From a64ef4ad7f38f2e5e1eb9a3d84345285072f9d77 Mon Sep 17 00:00:00 2001 From: wenytang-ms Date: Wed, 20 May 2026 14:16:44 +0800 Subject: [PATCH 2/3] fix(test-plans): restrict skipLlmVerify to G1/G4; add retries for cold-cache flakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the over-broad framework auto-skip (any structured verify -> no LLM) that was landed in autotest v0.7.5/0.7.6. LLM screenshot verification is the anti-silent-pass safety net and must stay enabled on steps where the screenshot carries unique signal (popup visibility, decoration lag, panel content). Final policy: - skipLlmVerify=true on Group 1 (16 ls-ready steps): waitForLanguageServer polls the same status bar text the LLM would read, so LLM adds zero signal. - skipLlmVerify=true on Group 4 (3 disk-write steps: save-after-organize, add-gson-dependency, create-formatter-profile): action mutates a file not open in any editor; before/after screenshots are by-design identical and LLM always downgrades. verifyFile from disk is the authoritative signal. - retries: 1 on 8 verify-completion steps to mitigate cold-cache 'Loading...' LLM downgrades while keeping the screenshot check enabled. - retries: 1 on java-maven-resolve-type save-after-resolve (kept from prior commit) for Maven indexer warm-up. - Wait bump 45 -> 90s on java-test-runner wait-test-discovery (kept). - java-pack-help-center-webview setup.extensions hard-requires java-pack (kept) — fixes the real bug (5/8 failures). LLM coverage preserved on verify-completion (popup visibility), verifyEditor (guards against page-wide DOM stale-tab fallback), verifyProblems (diagnostics red squiggle lag) and verifyWebview (visual rendering). Requires autotest >= 0.7.7 to honor skipLlmVerify without the auto-skip side effect. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test-plans/java-basic-editing.yaml | 5 +++++ test-plans/java-debugger.yaml | 1 + test-plans/java-fresh-import.yaml | 3 +++ test-plans/java-gradle-java25.yaml | 3 +++ test-plans/java-gradle.yaml | 1 + test-plans/java-maven-java25.yaml | 3 +++ test-plans/java-maven-multimodule.yaml | 5 +++++ test-plans/java-maven-resolve-type.yaml | 4 ++++ test-plans/java-maven.yaml | 2 ++ test-plans/java-single-file.yaml | 2 ++ test-plans/java-single-no-workspace.yaml | 3 +++ test-plans/java-test-runner.yaml | 1 + test-plans/java-webview-migration.yaml | 3 +++ 13 files changed, 36 insertions(+) diff --git a/test-plans/java-basic-editing.yaml b/test-plans/java-basic-editing.yaml index 0093d73d..c1f8377a 100644 --- a/test-plans/java-basic-editing.yaml +++ b/test-plans/java-basic-editing.yaml @@ -48,6 +48,7 @@ steps: - id: "ls-ready" action: "waitForLanguageServer" verify: "Java extension has activated and the simple-app project tree is visible in the Explorer sidebar" + skipLlmVerify: true # waitForLanguageServer reads the same status bar text the LLM would inspect verifyProblems: errors: 2 timeout: 120 @@ -140,6 +141,10 @@ steps: path: "~/src/app/App.java" contains: "import java.io.File" timeout: 10 + # Save All causes no meaningful on-screen change: only the dirty dot on + # the saved file's tab clears, and the active editor isn't the saved file. + # Before/after screenshots are effectively identical by design, so the LLM downgrades.
+ skipLlmVerify: true # ── Step 8: Rename Symbol (F2) ────────────────────────────── - id: "close-all-before-rename" diff --git a/test-plans/java-debugger.yaml b/test-plans/java-debugger.yaml index a30051c1..92e241b2 100644 --- a/test-plans/java-debugger.yaml +++ b/test-plans/java-debugger.yaml @@ -44,6 +44,7 @@ steps: verify: "Java workspace has loaded; Problems panel shows no errors" verifyProblems: errors: 0 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar timeout: 120 # ── Open App.java ──────────────────────────────────────── diff --git a/test-plans/java-fresh-import.yaml b/test-plans/java-fresh-import.yaml index c4952942..b12858fc 100644 --- a/test-plans/java-fresh-import.yaml +++ b/test-plans/java-fresh-import.yaml @@ -70,3 +70,6 @@ steps: verifyCompletion: notEmpty: true waitBefore: 5 + # LLM may downgrade if it sees "Loading..." spinner on cold cache; retry + # gives the LS a warmed cache so the popup is fully rendered. + retries: 1 diff --git a/test-plans/java-gradle-java25.yaml b/test-plans/java-gradle-java25.yaml index 2b8392c9..2462f6e1 100644 --- a/test-plans/java-gradle-java25.yaml +++ b/test-plans/java-gradle-java25.yaml @@ -36,6 +36,7 @@ steps: verifyProblems: errors: 0 timeout: 300 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar # ── Step 2: Open Java file ─────────────────────────────── # wiki: "Open Foo.java, make sure the editing experience is correctly working" @@ -54,6 +55,8 @@ steps: verifyCompletion: notEmpty: true waitBefore: 8 + # Retry once on cold-cache "Loading..." LLM downgrades. 
+ retries: 1 # ── Step 4: Verify editing ──────────────────────────────── - id: "goto-line" diff --git a/test-plans/java-gradle.yaml b/test-plans/java-gradle.yaml index f6f488cc..02416b46 100644 --- a/test-plans/java-gradle.yaml +++ b/test-plans/java-gradle.yaml @@ -36,6 +36,7 @@ steps: verifyProblems: errors: 0 timeout: 300 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar # ── Step 2: Open Foo.java and verify editing experience ─ # wiki: "Open Foo.java file, make sure the editing experience diff --git a/test-plans/java-maven-java25.yaml b/test-plans/java-maven-java25.yaml index 25c5f444..2a785235 100644 --- a/test-plans/java-maven-java25.yaml +++ b/test-plans/java-maven-java25.yaml @@ -34,6 +34,7 @@ steps: verifyProblems: errors: 0 timeout: 180 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar # ── Step 2: Open Java file ─────────────────────────────── # wiki: "Open Bar.java, make sure the editing experience is correctly working" @@ -52,6 +53,8 @@ steps: verifyCompletion: notEmpty: true waitBefore: 8 + # Retry once on cold-cache "Loading..." LLM downgrades. 
+ retries: 1 # ── Step 4: Verify editing ──────────────────────────────── - id: "goto-line" diff --git a/test-plans/java-maven-multimodule.yaml b/test-plans/java-maven-multimodule.yaml index ff0e2f89..efe24dec 100644 --- a/test-plans/java-maven-multimodule.yaml +++ b/test-plans/java-maven-multimodule.yaml @@ -32,6 +32,7 @@ steps: action: "waitForLanguageServer" verify: "Multimodule Maven workspace has loaded; the Java extension is initialized for the project with module1 and module2 visible in the Explorer (the Problems panel may briefly show diagnostics that are still being recomputed after import — the verifyProblems checks below pin the final state)" timeout: 180 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar # ── Step 2: Verify module1 Foo.java ────────────────────── # wiki: "make sure the editing experience is correctly working @@ -48,6 +49,8 @@ steps: verifyCompletion: notEmpty: true waitBefore: 8 + # Retry once on cold-cache "Loading..." LLM downgrades. + retries: 1 # Close module1's tab first so the next `open file Foo.java` request # disambiguates to module2/Foo.java rather than re-focusing the already- @@ -69,3 +72,5 @@ steps: verifyCompletion: notEmpty: true waitBefore: 8 + # Retry once on cold-cache "Loading..." LLM downgrades. + retries: 1 diff --git a/test-plans/java-maven-resolve-type.yaml b/test-plans/java-maven-resolve-type.yaml index ca42d3df..477bc100 100644 --- a/test-plans/java-maven-resolve-type.yaml +++ b/test-plans/java-maven-resolve-type.yaml @@ -105,6 +105,10 @@ steps: path: "~/pom.xml" contains: "com.google.code.gson" waitBefore: 2 + # Disk-only insertLineInFile: pom.xml isn't in any editor, so before/after + # screenshots are necessarily identical. LLM always downgrades; verifyFile + # reading from disk is the only meaningful signal. + skipLlmVerify: true # Re-open pom.xml so the AFTER screenshot shows the new # block. Loading fresh from disk avoids any in-memory/disk mismatch. 
diff --git a/test-plans/java-maven.yaml b/test-plans/java-maven.yaml index 0834e658..c2f198bb 100644 --- a/test-plans/java-maven.yaml +++ b/test-plans/java-maven.yaml @@ -50,6 +50,8 @@ steps: verifyCompletion: notEmpty: true waitBefore: 8 + # Retry once on cold-cache "Loading..." LLM downgrades. + retries: 1 # 2c. Verify cursor navigation (goToLine) - id: "goto-line" diff --git a/test-plans/java-single-file.yaml b/test-plans/java-single-file.yaml index 8d7e36bc..d41867a6 100644 --- a/test-plans/java-single-file.yaml +++ b/test-plans/java-single-file.yaml @@ -52,6 +52,8 @@ steps: verifyCompletion: notEmpty: true waitBefore: 8 + # Retry once on cold-cache "Loading..." LLM downgrades. + retries: 1 # ── Step 4: Verify basic editing ──────────────────────────────── - id: "goto-main" diff --git a/test-plans/java-single-no-workspace.yaml b/test-plans/java-single-no-workspace.yaml index eec9557e..87638cbb 100644 --- a/test-plans/java-single-no-workspace.yaml +++ b/test-plans/java-single-no-workspace.yaml @@ -31,6 +31,7 @@ steps: action: "waitForLanguageServer" verify: "Java extension has activated for the single-file (no-workspace) mode; no error notifications visible" timeout: 120 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar # ── Step 2: Verify file is open ────────────────────────────── - id: "verify-file-open" @@ -52,6 +53,8 @@ steps: notEmpty: true waitBefore: 8 timeout: 30 + # Retry once on cold-cache "Loading..." LLM downgrades. 
+ retries: 1 # ── Step 4: Verify editing ──────────────────────────────────── - id: "goto-line" diff --git a/test-plans/java-test-runner.yaml b/test-plans/java-test-runner.yaml index cb52a146..ae12f83f 100644 --- a/test-plans/java-test-runner.yaml +++ b/test-plans/java-test-runner.yaml @@ -38,6 +38,7 @@ steps: action: "waitForLanguageServer" verify: "maven-junit workspace has loaded; the Java extension is initialized for the project" timeout: 300 + skipLlmVerify: true # waitForLanguageServer is authoritative; LLM only sees the same status bar # ── Step 1: Open test file so CodeLens can render ─────────── - id: "open-test-file" diff --git a/test-plans/java-webview-migration.yaml b/test-plans/java-webview-migration.yaml index 48933316..ec1eeeb1 100644 --- a/test-plans/java-webview-migration.yaml +++ b/test-plans/java-webview-migration.yaml @@ -260,6 +260,9 @@ steps: verifyFile: path: "~/.vscode/java-formatter.xml" contains: "CodeFormatterProfile" + # Disk-only insertLineInFile against a file not in any editor, so the + # before/after screenshots are identical by design. + skipLlmVerify: true - id: "close-profile-file-before-formatter" action: "run command View: Close All Editors" From ec5a5e5eb6aeb708eb246fc6853d77c656eb8085 Mon Sep 17 00:00:00 2001 From: wenytang-ms Date: Wed, 20 May 2026 14:32:58 +0800 Subject: [PATCH 3/3] ci: retrigger E2E AutoTest against @vscjava/vscode-autotest@0.7.8 The 0.7.7 release did not actually honor skipLlmVerify because planParser dropped the field during deserialization. 0.7.8 contains the parser fix; this empty commit restarts CI so the matrix installs the correct version. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>