elastic · TamerlanG · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
@@ -26,6 +26,7 @@ spec:
         KIBANA_SLACK_NOTIFICATIONS_ENABLED: 'true'
         SLACK_NOTIFICATIONS_SKIP_FOR_RETRIES: 'true'
         SCOUT_REPORTER_ENABLED: 'true'
+        FTR_SMART_RETRY_ENABLED: 'false'
       allow_rebuilds: true
       branch_configuration: main 9.4 9.3 8.19
       default_branch: main

@@ -23,6 +23,7 @@ spec:
         KIBANA_GITHUB_BUILD_COMMIT_STATUS_ENABLED: 'true'
         GITHUB_BUILD_COMMIT_STATUS_CONTEXT: kibana-ci
         SCOUT_REPORTER_ENABLED: 'true'
+        FTR_SMART_RETRY_ENABLED: 'false'
       allow_rebuilds: true
       branch_configuration: ''
       cancel_intermediate_builds: true

@@ -3,6 +3,7 @@
 set -euo pipefail
 
 source .buildkite/scripts/steps/functional/common.sh
+source .buildkite/scripts/steps/test/ftr_smart_retry.sh
 
 BUILDKITE_PARALLEL_JOB=${BUILDKITE_PARALLEL_JOB:-}
 FTR_CONFIG_GROUP_KEY=${FTR_CONFIG_GROUP_KEY:-}
@@ -11,6 +12,11 @@ if [ "$FTR_CONFIG_GROUP_KEY" == "" ] && [ "$BUILDKITE_PARALLEL_JOB" == "" ]; the
   exit 1
 fi
 
+BAIL_ARG="--bail"
+if [[ "${FTR_SMART_RETRY_ENABLED:-}" =~ ^(1|true)$ ]]; then
+  BAIL_ARG=""
+fi
+
 EXTRA_ARGS=${FTR_EXTRA_ARGS:-}
 test -z "$EXTRA_ARGS" || buildkite-agent meta-data set "ftr-extra-args" "$EXTRA_ARGS"
 
@@ -52,7 +58,7 @@ while read -r config; do
     continue;
   fi
 
-  FULL_COMMAND="node scripts/functional_tests --bail --config $config $EXTRA_ARGS"
+  FULL_COMMAND="node scripts/functional_tests $BAIL_ARG --config $config $EXTRA_ARGS"
 
   # see if this config has already been executed successfully
   CONFIG_EXECUTION_KEY="${config}_executed"
@@ -90,9 +96,9 @@ while read -r config; do
   # prevent non-zero exit code from breaking the loop
   set +e;
   node ./scripts/functional_tests \
-    --bail \
     --kibana-install-dir "$KIBANA_BUILD_LOCATION" \
     --config="$config" \
+    $BAIL_ARG \
     "$EXTRA_ARGS"
   lastCode=$?
   set -e;
@@ -141,6 +147,12 @@ if [[ "$failedConfigs" ]]; then
   buildkite-agent meta-data set "$FAILED_CONFIGS_KEY" "$failedConfigs"
 fi
 
+
+if [[ "${FTR_SMART_RETRY_ENABLED:-}" =~ ^(1|true)$ ]]; then
+  store_failing_tests  # attempt 1: record what failed so the retry can verify recovery
+  apply_smart_retry    # attempt 2: mark green if all previously-failing tests explicitly passed
+fi
+
 echo "--- FTR configs complete"
 printf "%s\n" "${results[@]}"
 echo ""

@@ -0,0 +1,61 @@
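+# Sourced by ftr_configs.sh — do not execute directly.
+# Reads globals: exitCode, JOB, FAILED_CONFIGS_KEY, BUILDKITE_RETRY_COUNT, BUILDKITE_STEP_ID,
+# FTR_CONFIG_GROUP_KEY, KIBANA_FLAKY_TEST_RUNNER_CONFIG. Writes: exitCode, failedConfigs, retry_recovered, FAILED_TESTS_KEY.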
+
+FAILED_TESTS_KEY="${BUILDKITE_STEP_ID}${FTR_CONFIG_GROUP_KEY}_failed_tests"
-FAILED_TESTS_KEY="${BUILDKITE_STEP_ID}${FTR_CONFIG_GROUP_KEY}_failed_tests"
+FAILED_TESTS_KEY="${BUILDKITE_STEP_ID:-}${FTR_CONFIG_GROUP_KEY:-}_failed_tests"
-FAILED_TESTS_KEY="${BUILDKITE_STEP_ID}${FTR_CONFIG_GROUP_KEY}_failed_tests"
+FAILED_TESTS_KEY="${BUILDKITE_STEP_ID:-}${FTR_CONFIG_GROUP_KEY:-}_failed_tests"
+retry_recovered=false
+
+# Called after attempt 1: stores failing test names so the retry can verify recovery.
+# Does nothing when KIBANA_FLAKY_TEST_RUNNER_CONFIG is set, when BUILDKITE_RETRY_COUNT
+# is not 0, when exitCode is 0, or when target/junit/$JOB does not exist.
+# Every early exit returns 0 explicitly: a bare `return` after a failed `[[ ... ]] ||`
+# would hand status 1 back to the caller, which aborts a script running under `set -e`.
+store_failing_tests() {
+  [[ -n "${KIBANA_FLAKY_TEST_RUNNER_CONFIG:-}" ]] && return 0
+  [[ "${BUILDKITE_RETRY_COUNT:-0}" != "0" ]] && return 0
+  [[ "$exitCode" == "0" ]] && return 0
+
+  local junitDir="target/junit/$JOB"
+  [[ -d "$junitDir" ]] || return 0
+
+  local failedTestNames
+  # Helper errors are ignored (stderr dropped, status forced to 0); only its stdout is used.
+  failedTestNames=$(node scripts/ftr_check_retry_result list-failures "$junitDir" 2>/dev/null || true)
+  if [[ "$failedTestNames" ]]; then
+    buildkite-agent meta-data set "$FAILED_TESTS_KEY" "$failedTestNames"
+    echo "Stored $(echo "$failedTestNames" | wc -l | tr -d ' ') previously-failing test name(s) for retry evaluation"
+  fi
+}
+
+# Called after attempt 2: marks the step green if all previously-failing tests explicitly passed.
+# On a third-or-later manual retry, logs that smart-retry is inactive.
+# Does nothing when KIBANA_FLAKY_TEST_RUNNER_CONFIG is set, when exitCode is 0, when
+# BUILDKITE_RETRY_COUNT is not 1, or when no failing-test names were stored under
+# FAILED_TESTS_KEY. Every early exit returns 0 explicitly so that a failed `[[ ... ]]`
+# guard never propagates status 1 to a caller running under `set -e`.
+# On recovery it sets exitCode=0, failedConfigs="", retry_recovered=true and clears
+# the FAILED_CONFIGS_KEY meta-data entry.
+apply_smart_retry() {
+  [[ -n "${KIBANA_FLAKY_TEST_RUNNER_CONFIG:-}" ]] && return 0
+  [[ "$exitCode" == "0" ]] && return 0
+
+  local retryCount="${BUILDKITE_RETRY_COUNT:-0}"
+
+  if [[ "$retryCount" -ge "2" ]]; then
+    echo "--- [smart-retry] inactive on attempt $((retryCount + 1)) — only applies to the first automatic retry"
+    return 0
+  fi
+
+  [[ "$retryCount" != "1" ]] && return 0
+
+  local prevFailedTests
+  # A failed meta-data lookup is treated the same as "nothing stored".
+  prevFailedTests=$(buildkite-agent meta-data get "$FAILED_TESTS_KEY" --default '' 2>/dev/null || true)
+  [[ "$prevFailedTests" ]] || return 0
+
+  local junitDir="target/junit/$JOB"
+
+  local intersectionCode
+  # Disable errexit so a non-zero status from the check can be captured instead of aborting.
+  set +e
+  printf '%s' "$prevFailedTests" | node scripts/ftr_check_retry_result check-intersection \
+    --junit-dir "$junitDir" \
+    --prev-failures-stdin
+  intersectionCode=$?
+  set -e
+
+  # Exit status 0 from check-intersection is taken to mean every stored failure recovered.
+  if [[ "$intersectionCode" == "0" ]]; then
+    echo "--- [smart-retry] All previously-failing tests recovered on retry — marking step green"
+    exitCode=0
+    failedConfigs=""
+    retry_recovered=true
+    buildkite-agent meta-data set "$FAILED_CONFIGS_KEY" "" 2>/dev/null || true
+  fi
+}
@@ -27,133 +27,135 @@ const DEFAULT_PATTERNS = [Path.resolve(REPO_ROOT, 'target/junit/**/*.xml')];
 const DISABLE_MISSING_TEST_REPORT_ERRORS =
   process.env.DISABLE_MISSING_TEST_REPORT_ERRORS === 'true';
 
-run(
-  async ({ log, flags }) => {
-    const indexInEs = Boolean(flags['index-errors']);
-    const reportUpdate = Boolean(flags['report-update']);
-
-    let updateGithub = Boolean(flags['github-update']);
-    if (updateGithub && !process.env.GITHUB_TOKEN) {
-      throw createFailError(
-        'GITHUB_TOKEN environment variable must be set, otherwise use --no-github-update flag'
-      );
-    }
-
-    let branch: string = '';
-    let pipeline: string = '';
-    let prependTitle: string = '';
-    if (updateGithub) {
-      branch = process.env.BUILDKITE_BRANCH || '';
-      pipeline = process.env.BUILDKITE_PIPELINE_SLUG || '';
-      updateGithub = process.env.REPORT_FAILED_TESTS_TO_GITHUB === 'true';
-      prependTitle = process.env.PREPEND_FAILURE_TITLE || '';
-
-      if (!branch) {
+export function runFailedTestsReporterCli() {
+  run(
+    async ({ log, flags }) => {
+      const indexInEs = Boolean(flags['index-errors']);
+      const reportUpdate = Boolean(flags['report-update']);
+
+      let updateGithub = Boolean(flags['github-update']);
+      if (updateGithub && !process.env.GITHUB_TOKEN) {
         throw createFailError(
-          'Unable to determine originating branch from job name or other environment variables'
+          'GITHUB_TOKEN environment variable must be set, otherwise use --no-github-update flag'
         );
       }
-    }
 
-    const githubApi = new GithubApi({
-      log,
-      token: process.env.GITHUB_TOKEN,
-      dryRun: !updateGithub,
-    });
-
-    const bkMeta = getBuildkiteMetadata();
-
-    try {
-      const buildUrl = flags['build-url'] || (updateGithub ? '' : 'http://buildUrl');
-      if (typeof buildUrl !== 'string' || !buildUrl) {
-        throw createFlagError('Missing --build-url or process.env.BUILD_URL');
+      let branch: string = '';
+      let pipeline: string = '';
+      let prependTitle: string = '';
+      if (updateGithub) {
+        branch = process.env.BUILDKITE_BRANCH || '';
+        pipeline = process.env.BUILDKITE_PIPELINE_SLUG || '';
+        updateGithub = process.env.REPORT_FAILED_TESTS_TO_GITHUB === 'true';
+        prependTitle = process.env.PREPEND_FAILURE_TITLE || '';
+
+        if (!branch) {
+          throw createFailError(
+            'Unable to determine originating branch from job name or other environment variables'
+          );
+        }
       }
 
-      const patterns = (flags._.length ? flags._ : DEFAULT_PATTERNS).map((p) =>
-        normalize(Path.resolve(p))
-      );
-      log.info('Searching for reports at', patterns);
-      const reportPaths = await globby(patterns, {
-        absolute: true,
+      const githubApi = new GithubApi({
+        log,
+        token: process.env.GITHUB_TOKEN,
+        dryRun: !updateGithub,
       });
 
-      if (!reportPaths.length && DISABLE_MISSING_TEST_REPORT_ERRORS) {
-        // it is fine for code coverage to not have test results
-        return;
-      }
-
-      if (reportPaths.length) {
-        log.info('found', reportPaths.length, 'reports', reportPaths);
+      const bkMeta = getBuildkiteMetadata();
 
-        // Separate JUnit and Scout reports
-        const junitReports = reportPaths.filter((p) => p.endsWith('.xml'));
-        const scoutReports = reportPaths.filter((p) => p.endsWith('.ndjson'));
+      try {
+        const buildUrl = flags['build-url'] || (updateGithub ? '' : 'http://buildUrl');
+        if (typeof buildUrl !== 'string' || !buildUrl) {
+          throw createFlagError('Missing --build-url or process.env.BUILD_URL');
+        }
 
-        log.info(
-          'Processing',
-          junitReports.length,
-          'JUnit reports and',
-          scoutReports.length,
-          'Scout reports'
+        const patterns = (flags._.length ? flags._ : DEFAULT_PATTERNS).map((p) =>
+          normalize(Path.resolve(p))
         );
-
-        const existingIssues = new ExistingFailedTestIssues(log);
-
-        const processParams: ProcessReportsParams = {
-          log,
-          existingIssues,
-          buildUrl,
-          githubApi,
-          branch,
-          pipeline,
-          prependTitle,
-          updateGithub,
-          indexInEs,
-          reportUpdate,
-          bkMeta,
-        };
-
-        // Process FTR JUnit reports
-        await processJUnitReports(junitReports, processParams);
-
-        // Process Scout reports
-        await processScoutReports(scoutReports, processParams);
-
-        // Generate Scout test failure artifacts after reports are updated (GH issue info, html reports, etc.)
-        await generateScoutTestFailureArtifacts({ log, bkMeta });
+        log.info('Searching for reports at', patterns);
+        const reportPaths = await globby(patterns, {
+          absolute: true,
+        });
+
+        if (!reportPaths.length && DISABLE_MISSING_TEST_REPORT_ERRORS) {
+          // it is fine for code coverage to not have test results
+          return;
+        }
+
+        if (reportPaths.length) {
+          log.info('found', reportPaths.length, 'reports', reportPaths);
+
+          // Separate JUnit and Scout reports
+          const junitReports = reportPaths.filter((p) => p.endsWith('.xml'));
+          const scoutReports = reportPaths.filter((p) => p.endsWith('.ndjson'));
+
+          log.info(
+            'Processing',
+            junitReports.length,
+            'JUnit reports and',
+            scoutReports.length,
+            'Scout reports'
+          );
+
+          const existingIssues = new ExistingFailedTestIssues(log);
+
+          const processParams: ProcessReportsParams = {
+            log,
+            existingIssues,
+            buildUrl,
+            githubApi,
+            branch,
+            pipeline,
+            prependTitle,
+            updateGithub,
+            indexInEs,
+            reportUpdate,
+            bkMeta,
+          };
+
+          // Process FTR JUnit reports
+          await processJUnitReports(junitReports, processParams);
+
+          // Process Scout reports
+          await processScoutReports(scoutReports, processParams);
+
+          // Generate Scout test failure artifacts after reports are updated (GH issue info, html reports, etc.)
+          await generateScoutTestFailureArtifacts({ log, bkMeta });
+        }
+      } finally {
+        await CiStatsReporter.fromEnv(log).metrics([
+          {
+            group: 'github api request count',
+            id: `failed test reporter`,
+            value: githubApi.getRequestCount(),
+            meta: Object.fromEntries(
+              Object.entries(bkMeta).map(
+                ([k, v]) => [`buildkite${k[0].toUpperCase()}${k.slice(1)}`, v] as const
+              )
+            ),
+          },
+        ]);
       }
-    } finally {
-      await CiStatsReporter.fromEnv(log).metrics([
-        {
-          group: 'github api request count',
-          id: `failed test reporter`,
-          value: githubApi.getRequestCount(),
-          meta: Object.fromEntries(
-            Object.entries(bkMeta).map(
-              ([k, v]) => [`buildkite${k[0].toUpperCase()}${k.slice(1)}`, v] as const
-            )
-          ),
+    },
+    {
+      description: `a cli that opens issues or updates existing issues based on junit reports`,
+      flags: {
+        boolean: ['github-update', 'report-update'],
+        string: ['build-url'],
+        default: {
+          'github-update': true,
+          'report-update': true,
+          'index-errors': true,
+          'build-url': process.env.BUILD_URL,
         },
-      ]);
-    }
-  },
-  {
-    description: `a cli that opens issues or updates existing issues based on junit reports`,
-    flags: {
-      boolean: ['github-update', 'report-update'],
-      string: ['build-url'],
-      default: {
-        'github-update': true,
-        'report-update': true,
-        'index-errors': true,
-        'build-url': process.env.BUILD_URL,
-      },
-      help: `
+        help: `
         --no-github-update Execute the CLI without writing to Github
         --no-report-update Execute the CLI without writing to the JUnit reports
         --no-index-errors  Execute the CLI without indexing failures into Elasticsearch
         --build-url        URL of the failed build, defaults to process.env.BUILD_URL
       `,
-    },
-  }
-);
+      },
+    }
+  );
+}