[Scout][CI] Retry only failed specs on lane retry (#274844)

steliosmavro · web-flow · commit 887a0d0a95e8 · 2026-06-25T18:38:47.000+03:00
## Summary On a Buildkite retry, re-run only the Scout spec files that failed instead of the whole config. Today, when a single spec fails, the entire config is marked failed and stored in build metadata, so a retry re-runs everything. That slows CI feedback and holds agents longer than needed. Closes elastic/appex-qa-team#858. ## How it works - After a config fails, parse the failed spec files from Playwright's JSON report and upload them as a build-scoped, per-config artifact, stamped with the attempt that produced it. - On retry, restore the snapshot from the immediately-preceding attempt and re-run only those spec files (passed as positional filters). Any miss falls back to re-running the whole config. - The attempt stamp handles the agent-lost case: if the previous attempt was lost before it could snapshot, there is nothing to restore for that attempt, so we safely re-run the whole config rather than trusting a stale snapshot. ## Notes - Retry unit is the spec **file** plus its server config (a spec can pass under one server config and fail under another). Re-running whole files also keeps `describe.serial` blocks intact, since their cases share state and must run together. ## Verified in CI Validated end-to-end in build [#459089](https://buildkite.com/elastic/kibana-pull-request/builds/459089) (both core cases observed on real lanes): **1. Real test failure → only the failed spec file re-runs.** Lane #14 failed on the first attempt; the retry restored the snapshot (`hosts/hosts_flyout.spec.ts`) and re-ran only that file (`Retrying 1 failed spec file(s)` → `8 passed`), not the whole `infra` config. <img width="844" height="664" alt="Screenshot 2026-06-25 at 10 11 16 AM" src="https://github.com/user-attachments/assets/3ff092be-0de9-4bfe-8bfc-44ea71548d13" /> **2. Agent lost → safe whole-config fallback.** Lane #58's first attempt was lost (exit -1) before it could snapshot, so the retry found nothing to restore and re-ran the whole config (`whole config will re-run` → `Running:`) instead of trusting stale data. <img width="742" height="507" alt="Screenshot 2026-06-25 at 10 15 46 AM" src="https://github.com/user-attachments/assets/c601ef29-3dc1-4300-b3cf-d491d02bf96c" />
diff --git a/.buildkite/scripts/steps/test/scout/run_test_lane.sh b/.buildkite/scripts/steps/test/scout/run_test_lane.sh
@@ -12,6 +12,7 @@ PASSED_INDICES=""
 PASSED=()
 FAILED=()
 SKIPPED=()
+RETRY_SPEC_FILES=()
 
 # Fail early if any of the given environment variable names are unset or empty
 check_required_env_vars() {
@@ -36,6 +37,70 @@ download_test_lane_loads() {
   download_tmp_artifact "$SCOUT_TEST_LANE_LOADS_PATH" . "$BUILDKITE_BUILD_ID"
 }
 
+# Per-config, attempt-stamped artifact path for a config's failed-spec snapshot.
+failed_specs_artifact_path() {
+  local idx="$1" attempt="$2"
+  local key="${BUILDKITE_STEP_KEY//[^a-zA-Z0-9_-]/_}"
+  echo ".scout/failed-specs-${key}-${idx}-attempt-${attempt}.json"
+}
+
+# Per-config path for Playwright's JSON report; set via PLAYWRIGHT_JSON_OUTPUT_FILE so it resolves
+# against this script's cwd rather than the config's directory.
+json_report_path() {
+  echo ".scout/test-results-${1}.json"
+}
+
+# After a config fails, persist its failed spec files so the next attempt can re-run only those.
+# Spec-file level (not per-test) so state-sharing blocks (e.g. describe.serial) re-run together.
+snapshot_failed_specs() {
+  local idx="$1"
+
+  local report
+  report="$(json_report_path "$idx")"
+  if [[ ! -f "$report" ]]; then
+    echo "No $report produced for index $idx; whole config will re-run on retry"
+    return
+  fi
+
+  # `|| true`: a non-zero jq exit (e.g. truncated report after a crash) must not abort the lane.
+  local failed_specs
+  failed_specs="$(jq -r '[.suites[]? | recurse(.suites[]?) | .specs[]? | select(.ok == false) | .file] | unique[]?' "$report" 2>/dev/null)" || true
+
+  if [[ -z "$failed_specs" ]]; then
+    echo "No failed spec files parsed for index $idx; whole config will re-run on retry"
+    return
+  fi
+
+  local artifact_path
+  artifact_path="$(failed_specs_artifact_path "$idx" "${BUILDKITE_RETRY_COUNT:-0}")"
+  printf '%s\n' "$failed_specs" > "$artifact_path"
+  # Best-effort: on upload failure the retry just re-runs the whole config.
+  buildkite-agent artifact upload "$artifact_path" || echo "Failed to upload failed-specs snapshot for index $idx"
+}
+
+# On retry, load the previous attempt's failed spec files into RETRY_SPEC_FILES. Returns non-zero
+# on any miss (first attempt, agent lost before snapshot, download failure) so we re-run the whole config.
+restore_failed_specs() {
+  local idx="$1"
+  RETRY_SPEC_FILES=()
+
+  local retry_count="${BUILDKITE_RETRY_COUNT:-0}"
+  [[ "$retry_count" -gt 0 ]] || return 1
+
+  local artifact_path
+  artifact_path="$(failed_specs_artifact_path "$idx" "$(( retry_count - 1 ))")"
+
+  # --include-retried-jobs: the snapshot was uploaded by the now-superseded previous attempt, whose
+  # artifacts buildkite-agent excludes by default.
+  if ! download_artifact "$artifact_path" . --include-retried-jobs >/dev/null 2>&1; then
+    return 1
+  fi
+  [[ -f "$artifact_path" ]] || return 1
+
+  mapfile -t RETRY_SPEC_FILES < "$artifact_path"
+  [[ ${#RETRY_SPEC_FILES[@]} -gt 0 ]]
+}
+
 # Read the comma-separated list of previously passed load indices from Buildkite metadata
 load_passed_indices() {
   PASSED_INDICES=$(buildkite-agent meta-data get "$PASSED_LOAD_INDICES_META_KEY" --default "" 2>/dev/null)
@@ -144,10 +209,19 @@ run_scout_tests() {
   local idx="$1"
   local config_path="$2"
 
-  echo "--- Running: $config_path"
+  # On retry, re-run only the previously-failed spec files (passed as positional path filters).
+  local spec_filter_args=()
+  if restore_failed_specs "$idx"; then
+    echo "--- Retrying ${#RETRY_SPEC_FILES[@]} failed spec file(s): $config_path"
+    printf '  %s\n' "${RETRY_SPEC_FILES[@]}"
+    spec_filter_args=("${RETRY_SPEC_FILES[@]}")
+  else
+    echo "--- Running: $config_path"
+  fi
 
   local pw_args=(
     test
+    "${spec_filter_args[@]+"${spec_filter_args[@]}"}"
     "--config=$config_path"
     "--grep=$PLAYWRIGHT_GREP_TAG"
     "--project=$PLAYWRIGHT_PROJECT"
@@ -158,6 +232,8 @@ run_scout_tests() {
     "SCOUT_TARGET_ARCH=$SCOUT_TEST_TARGET_ARCH"
     "SCOUT_TARGET_DOMAIN=$SCOUT_TEST_TARGET_DOMAIN"
     "NODE_OPTIONS=${NODE_OPTIONS:-} --require=@kbn/babel-register/install"
+    # Pin the JSON report to a path we control (see json_report_path).
+    "PLAYWRIGHT_JSON_OUTPUT_FILE=$(json_report_path "$idx")"
   )
 
   local start_time
@@ -184,6 +260,7 @@ run_scout_tests() {
       ;;
     *)
       upload_report_events "$config_path"
+      snapshot_failed_specs "$idx"
       FAILED+=("$config_path")
       echo "Exited with code $exit_code for $config_path"
       ;;