Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
5201524
chore: remove old comments that references file that doesn't exist an…
TamerlanG May 16, 2026
29689fd
feat(ci): add ftr retry result checker to kbn-failed-test-reporter-cli
TamerlanG May 16, 2026
f89d58a
feat(ci): mark FTR retry green when previously-failing tests recover
TamerlanG May 16, 2026
54726fb
test(ci): TEMP add retry-validation fixture — DELETE BEFORE MERGE
TamerlanG May 16, 2026
8403a73
Merge branch 'main' into ftr/smart-retry
TamerlanG May 16, 2026
1be5cd8
Revert "test(ci): TEMP add retry-validation fixture — DELETE BEFORE M…
TamerlanG May 16, 2026
f64246a
Merge branch 'main' into ftr/smart-retry
TamerlanG May 18, 2026
27b3524
[CI] Add job annotation to FTR configs summary
TamerlanG May 18, 2026
443e1cf
Reapply "test(ci): TEMP add retry-validation fixture — DELETE BEFORE …
TamerlanG May 18, 2026
093dc39
Merge branch 'main' into ftr/smart-retry
TamerlanG May 21, 2026
bb84df5
Merge branch 'main' into ftr/smart-retry
TamerlanG May 21, 2026
221df45
Merge branch 'main' into ftr/smart-retry
TamerlanG May 22, 2026
0cadf4a
Merge branch 'main' into ftr/smart-retry
TamerlanG May 25, 2026
5f661e0
Merge branch 'main' into ftr/smart-retry
TamerlanG May 26, 2026
2522c6c
Merge branch 'main' into ftr/smart-retry
TamerlanG May 26, 2026
11841f7
improve job annotation
TamerlanG May 26, 2026
6931aeb
remove bail
TamerlanG May 26, 2026
b137ca2
remove view logs link from job annotation
TamerlanG May 26, 2026
ad1ae9a
show failing test names per config in job annotation
TamerlanG May 26, 2026
7f2c559
Revert "Reapply "test(ci): TEMP add retry-validation fixture — DELETE…
TamerlanG May 26, 2026
b9aa8a4
refactor(ci): simplify ftr_configs.sh annotation and failure extraction
TamerlanG May 27, 2026
8ab9a58
feat(ci): verify explicit passes on retry instead of absence of failure
TamerlanG May 27, 2026
924635c
fix(ci): guard scout reporter error, log smart-retry inactivity, clar…
TamerlanG May 27, 2026
989bd71
refactor(ci): split ftr_configs.sh into focused helper files
TamerlanG May 27, 2026
8238bd3
refactor(ci): move XML diff dance and temp-file plumbing into the Nod…
TamerlanG May 27, 2026
65bc3fd
Merge branch 'main' into ftr/smart-retry
TamerlanG May 27, 2026
84f0b90
chore(ci): remove job annotation from smart-retry PR
TamerlanG May 27, 2026
c3712f9
refactor(ci): remove dead computeIntersection export and initialize r…
TamerlanG May 27, 2026
69c8326
Reapply "test(ci): TEMP add retry-validation fixture — DELETE BEFORE …
TamerlanG May 18, 2026
c8a62ec
Revert "chore: remove old comments that references file that doesn't …
TamerlanG May 28, 2026
2193bfb
Revert "Reapply "test(ci): TEMP add retry-validation fixture — DELETE…
TamerlanG May 28, 2026
a1c1565
bring back verbose version
TamerlanG May 28, 2026
d51172c
revert comments
TamerlanG May 28, 2026
3cfcdfb
refactor(ci): move FAILED_TESTS_KEY and retry_recovered into ftr_smar…
TamerlanG May 28, 2026
6483777
add whitesapce
TamerlanG May 28, 2026
fcdfba7
Reapply "Reapply "test(ci): TEMP add retry-validation fixture — DELET…
TamerlanG May 28, 2026
ac19c1a
Merge branch 'main' into ftr/smart-retry
TamerlanG May 29, 2026
0ba5cbb
put this all behind an env flag
TamerlanG May 30, 2026
bad92a9
put bail behind a env variable too
TamerlanG May 30, 2026
be25fc6
Update .buildkite/scripts/steps/test/ftr_configs.sh
TamerlanG May 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ spec:
KIBANA_SLACK_NOTIFICATIONS_ENABLED: 'true'
SLACK_NOTIFICATIONS_SKIP_FOR_RETRIES: 'true'
SCOUT_REPORTER_ENABLED: 'true'
FTR_SMART_RETRY_ENABLED: 'false'
allow_rebuilds: true
branch_configuration: main 9.4 9.3 8.19
default_branch: main
Expand Down
1 change: 1 addition & 0 deletions .buildkite/pipeline-resource-definitions/kibana-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ spec:
KIBANA_GITHUB_BUILD_COMMIT_STATUS_ENABLED: 'true'
GITHUB_BUILD_COMMIT_STATUS_CONTEXT: kibana-ci
SCOUT_REPORTER_ENABLED: 'true'
FTR_SMART_RETRY_ENABLED: 'false'
allow_rebuilds: true
branch_configuration: ''
cancel_intermediate_builds: true
Expand Down
16 changes: 14 additions & 2 deletions .buildkite/scripts/steps/test/ftr_configs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
set -euo pipefail

source .buildkite/scripts/steps/functional/common.sh
source .buildkite/scripts/steps/test/ftr_smart_retry.sh

BUILDKITE_PARALLEL_JOB=${BUILDKITE_PARALLEL_JOB:-}
FTR_CONFIG_GROUP_KEY=${FTR_CONFIG_GROUP_KEY:-}
Expand All @@ -11,6 +12,11 @@ if [ "$FTR_CONFIG_GROUP_KEY" == "" ] && [ "$BUILDKITE_PARALLEL_JOB" == "" ]; the
exit 1
fi

BAIL_ARG="--bail"
if [[ "${FTR_SMART_RETRY_ENABLED:-}" =~ ^(1|true)$ ]]; then
BAIL_ARG=""
fi

EXTRA_ARGS=${FTR_EXTRA_ARGS:-}
test -z "$EXTRA_ARGS" || buildkite-agent meta-data set "ftr-extra-args" "$EXTRA_ARGS"

Expand Down Expand Up @@ -52,7 +58,7 @@ while read -r config; do
continue;
fi

FULL_COMMAND="node scripts/functional_tests --bail --config $config $EXTRA_ARGS"
FULL_COMMAND="node scripts/functional_tests $BAIL_ARG --config $config $EXTRA_ARGS"

# see if this config has already been executed successfully
CONFIG_EXECUTION_KEY="${config}_executed"
Expand Down Expand Up @@ -90,9 +96,9 @@ while read -r config; do
# prevent non-zero exit code from breaking the loop
set +e;
node ./scripts/functional_tests \
--bail \
--kibana-install-dir "$KIBANA_BUILD_LOCATION" \
--config="$config" \
$BAIL_ARG \
"$EXTRA_ARGS"
lastCode=$?
set -e;
Expand Down Expand Up @@ -141,6 +147,12 @@ if [[ "$failedConfigs" ]]; then
buildkite-agent meta-data set "$FAILED_CONFIGS_KEY" "$failedConfigs"
fi


if [[ "${FTR_SMART_RETRY_ENABLED:-}" =~ ^(1|true)$ ]]; then
store_failing_tests # attempt 1: record what failed so the retry can verify recovery
apply_smart_retry # attempt 2: mark green if all previously-failing tests explicitly passed
fi

echo "--- FTR configs complete"
printf "%s\n" "${results[@]}"
echo ""
Expand Down
61 changes: 61 additions & 0 deletions .buildkite/scripts/steps/test/ftr_smart_retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Sourced by ftr_configs.sh — do not execute directly.
# Reads/writes globals: exitCode, failedConfigs,
# FAILED_CONFIGS_KEY, JOB, BUILDKITE_RETRY_COUNT.

# Meta-data key under which attempt 1 records its failing test names.
# This assignment runs at source time, after the caller has enabled `set -u`,
# so both variables take a `:-` default: without it an unset BUILDKITE_STEP_ID
# or FTR_CONFIG_GROUP_KEY aborts the whole step with "unbound variable".
FAILED_TESTS_KEY="${BUILDKITE_STEP_ID:-}${FTR_CONFIG_GROUP_KEY:-}_failed_tests"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correctness/regression: this line runs at source-time (top of ftr_configs.sh line 6) which is before the defensive FTR_CONFIG_GROUP_KEY=${FTR_CONFIG_GROUP_KEY:-} assignment on line 9 of ftr_configs.sh. Because ftr_configs.sh enables set -euo pipefail before sourcing, expanding ${FTR_CONFIG_GROUP_KEY} (no :- default) under set -u will abort the entire step with FTR_CONFIG_GROUP_KEY: unbound variable in any run where that env var is not set on the agent — exactly the case the existing FTR_CONFIG_GROUP_KEY=${FTR_CONFIG_GROUP_KEY:-} line was written to guard against. The standard "FTR Configs" steps in .buildkite/pipeline-utils/ci-stats/pick_test_group_run_order/steps.ts:82 set it explicitly, but other callers (e.g. parallel-job paths handled at lines 10–13 of ftr_configs.sh) and local runs do not.

Easiest fix: add a :- default so source-time evaluation is safe.

Suggested change
FAILED_TESTS_KEY="${BUILDKITE_STEP_ID}${FTR_CONFIG_GROUP_KEY}_failed_tests"
FAILED_TESTS_KEY="${BUILDKITE_STEP_ID:-}${FTR_CONFIG_GROUP_KEY:-}_failed_tests"

retry_recovered=false

# Called after attempt 1: stores failing test names so the retry can verify recovery.
#
# Reads globals: exitCode, JOB, FAILED_TESTS_KEY, BUILDKITE_RETRY_COUNT,
#                KIBANA_FLAKY_TEST_RUNNER_CONFIG.
# Does nothing when KIBANA_FLAKY_TEST_RUNNER_CONFIG is set, when this is a
# retry, when the run passed, or when no JUnit directory exists for the job.
# Every early exit returns 0 so that a skipped bookkeeping step never aborts
# the calling script, which runs under `set -e`.
store_failing_tests() {
  [[ -n "${KIBANA_FLAKY_TEST_RUNNER_CONFIG:-}" ]] && return 0
  [[ "${BUILDKITE_RETRY_COUNT:-0}" != "0" ]] && return 0
  [[ "$exitCode" == "0" ]] && return 0

  local junitDir="target/junit/$JOB"
  # A bare `return` here would hand back the status of the failed `[[ -d ]]`
  # test (1) and trip `set -e` in the caller; return success explicitly.
  [[ -d "$junitDir" ]] || return 0

  local failedTestNames
  # Best effort: a failing or missing checker yields an empty list, not an error.
  failedTestNames=$(node scripts/ftr_check_retry_result list-failures "$junitDir" 2>/dev/null || true)
  if [[ "$failedTestNames" ]]; then
    buildkite-agent meta-data set "$FAILED_TESTS_KEY" "$failedTestNames"
    echo "Stored $(echo "$failedTestNames" | wc -l | tr -d ' ') previously-failing test name(s) for retry evaluation"
  fi
}

# Called after attempt 2: marks the step green if all previously-failing tests explicitly passed.
# On a third-or-later manual retry, logs that smart-retry is inactive.
#
# Reads globals: exitCode, JOB, FAILED_TESTS_KEY, FAILED_CONFIGS_KEY,
#                BUILDKITE_RETRY_COUNT, KIBANA_FLAKY_TEST_RUNNER_CONFIG.
# Writes globals (only on recovery): exitCode=0, failedConfigs="", retry_recovered=true.
# Every early exit returns 0 so that an inapplicable smart-retry never aborts
# the calling script, which runs under `set -e`.
apply_smart_retry() {
  [[ -n "${KIBANA_FLAKY_TEST_RUNNER_CONFIG:-}" ]] && return 0
  [[ "$exitCode" == "0" ]] && return 0

  local retryCount="${BUILDKITE_RETRY_COUNT:-0}"

  if [[ "$retryCount" -ge "2" ]]; then
    echo "--- [smart-retry] inactive on attempt $((retryCount + 1)) — only applies to the first automatic retry"
    return 0
  fi

  # Only the first retry (retry count 1) is evaluated; attempt 1 falls through here.
  [[ "$retryCount" != "1" ]] && return 0

  local prevFailedTests
  prevFailedTests=$(buildkite-agent meta-data get "$FAILED_TESTS_KEY" --default '' 2>/dev/null || true)
  # Nothing recorded by attempt 1: leave the failure as-is. A bare `return`
  # would propagate the failed test's status (1) and trip `set -e` in the caller.
  [[ "$prevFailedTests" ]] || return 0

  local junitDir="target/junit/$JOB"

  local intersectionCode
  # The checker's non-zero exit is a result to inspect, not an error, so
  # errexit is suspended around the pipeline.
  set +e
  printf '%s' "$prevFailedTests" | node scripts/ftr_check_retry_result check-intersection \
    --junit-dir "$junitDir" \
    --prev-failures-stdin
  intersectionCode=$?
  set -e

  if [[ "$intersectionCode" == "0" ]]; then
    echo "--- [smart-retry] All previously-failing tests recovered on retry — marking step green"
    exitCode=0
    failedConfigs=""
    retry_recovered=true
    buildkite-agent meta-data set "$FAILED_CONFIGS_KEY" "" 2>/dev/null || true
  fi
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,133 +27,135 @@ const DEFAULT_PATTERNS = [Path.resolve(REPO_ROOT, 'target/junit/**/*.xml')];
const DISABLE_MISSING_TEST_REPORT_ERRORS =
process.env.DISABLE_MISSING_TEST_REPORT_ERRORS === 'true';

run(
async ({ log, flags }) => {
const indexInEs = Boolean(flags['index-errors']);
const reportUpdate = Boolean(flags['report-update']);

let updateGithub = Boolean(flags['github-update']);
if (updateGithub && !process.env.GITHUB_TOKEN) {
throw createFailError(
'GITHUB_TOKEN environment variable must be set, otherwise use --no-github-update flag'
);
}

let branch: string = '';
let pipeline: string = '';
let prependTitle: string = '';
if (updateGithub) {
branch = process.env.BUILDKITE_BRANCH || '';
pipeline = process.env.BUILDKITE_PIPELINE_SLUG || '';
updateGithub = process.env.REPORT_FAILED_TESTS_TO_GITHUB === 'true';
prependTitle = process.env.PREPEND_FAILURE_TITLE || '';

if (!branch) {
export function runFailedTestsReporterCli() {
run(
async ({ log, flags }) => {
const indexInEs = Boolean(flags['index-errors']);
const reportUpdate = Boolean(flags['report-update']);

let updateGithub = Boolean(flags['github-update']);
if (updateGithub && !process.env.GITHUB_TOKEN) {
throw createFailError(
'Unable to determine originating branch from job name or other environment variables'
'GITHUB_TOKEN environment variable must be set, otherwise use --no-github-update flag'
);
}
}

const githubApi = new GithubApi({
log,
token: process.env.GITHUB_TOKEN,
dryRun: !updateGithub,
});

const bkMeta = getBuildkiteMetadata();

try {
const buildUrl = flags['build-url'] || (updateGithub ? '' : 'http://buildUrl');
if (typeof buildUrl !== 'string' || !buildUrl) {
throw createFlagError('Missing --build-url or process.env.BUILD_URL');
let branch: string = '';
let pipeline: string = '';
let prependTitle: string = '';
if (updateGithub) {
branch = process.env.BUILDKITE_BRANCH || '';
pipeline = process.env.BUILDKITE_PIPELINE_SLUG || '';
updateGithub = process.env.REPORT_FAILED_TESTS_TO_GITHUB === 'true';
prependTitle = process.env.PREPEND_FAILURE_TITLE || '';

if (!branch) {
throw createFailError(
'Unable to determine originating branch from job name or other environment variables'
);
}
}

const patterns = (flags._.length ? flags._ : DEFAULT_PATTERNS).map((p) =>
normalize(Path.resolve(p))
);
log.info('Searching for reports at', patterns);
const reportPaths = await globby(patterns, {
absolute: true,
const githubApi = new GithubApi({
log,
token: process.env.GITHUB_TOKEN,
dryRun: !updateGithub,
});

if (!reportPaths.length && DISABLE_MISSING_TEST_REPORT_ERRORS) {
// it is fine for code coverage to not have test results
return;
}

if (reportPaths.length) {
log.info('found', reportPaths.length, 'reports', reportPaths);
const bkMeta = getBuildkiteMetadata();

// Separate JUnit and Scout reports
const junitReports = reportPaths.filter((p) => p.endsWith('.xml'));
const scoutReports = reportPaths.filter((p) => p.endsWith('.ndjson'));
try {
const buildUrl = flags['build-url'] || (updateGithub ? '' : 'http://buildUrl');
if (typeof buildUrl !== 'string' || !buildUrl) {
throw createFlagError('Missing --build-url or process.env.BUILD_URL');
}

log.info(
'Processing',
junitReports.length,
'JUnit reports and',
scoutReports.length,
'Scout reports'
const patterns = (flags._.length ? flags._ : DEFAULT_PATTERNS).map((p) =>
normalize(Path.resolve(p))
);

const existingIssues = new ExistingFailedTestIssues(log);

const processParams: ProcessReportsParams = {
log,
existingIssues,
buildUrl,
githubApi,
branch,
pipeline,
prependTitle,
updateGithub,
indexInEs,
reportUpdate,
bkMeta,
};

// Process FTR JUnit reports
await processJUnitReports(junitReports, processParams);

// Process Scout reports
await processScoutReports(scoutReports, processParams);

// Generate Scout test failure artifacts after reports are updated (GH issue info, html reports, etc.)
await generateScoutTestFailureArtifacts({ log, bkMeta });
log.info('Searching for reports at', patterns);
const reportPaths = await globby(patterns, {
absolute: true,
});

if (!reportPaths.length && DISABLE_MISSING_TEST_REPORT_ERRORS) {
// it is fine for code coverage to not have test results
return;
}

if (reportPaths.length) {
log.info('found', reportPaths.length, 'reports', reportPaths);

// Separate JUnit and Scout reports
const junitReports = reportPaths.filter((p) => p.endsWith('.xml'));
const scoutReports = reportPaths.filter((p) => p.endsWith('.ndjson'));

log.info(
'Processing',
junitReports.length,
'JUnit reports and',
scoutReports.length,
'Scout reports'
);

const existingIssues = new ExistingFailedTestIssues(log);

const processParams: ProcessReportsParams = {
log,
existingIssues,
buildUrl,
githubApi,
branch,
pipeline,
prependTitle,
updateGithub,
indexInEs,
reportUpdate,
bkMeta,
};

// Process FTR JUnit reports
await processJUnitReports(junitReports, processParams);

// Process Scout reports
await processScoutReports(scoutReports, processParams);

// Generate Scout test failure artifacts after reports are updated (GH issue info, html reports, etc.)
await generateScoutTestFailureArtifacts({ log, bkMeta });
}
} finally {
await CiStatsReporter.fromEnv(log).metrics([
{
group: 'github api request count',
id: `failed test reporter`,
value: githubApi.getRequestCount(),
meta: Object.fromEntries(
Object.entries(bkMeta).map(
([k, v]) => [`buildkite${k[0].toUpperCase()}${k.slice(1)}`, v] as const
)
),
},
]);
}
} finally {
await CiStatsReporter.fromEnv(log).metrics([
{
group: 'github api request count',
id: `failed test reporter`,
value: githubApi.getRequestCount(),
meta: Object.fromEntries(
Object.entries(bkMeta).map(
([k, v]) => [`buildkite${k[0].toUpperCase()}${k.slice(1)}`, v] as const
)
),
},
{
description: `a cli that opens issues or updates existing issues based on junit reports`,
flags: {
boolean: ['github-update', 'report-update'],
string: ['build-url'],
default: {
'github-update': true,
'report-update': true,
'index-errors': true,
'build-url': process.env.BUILD_URL,
},
]);
}
},
{
description: `a cli that opens issues or updates existing issues based on junit reports`,
flags: {
boolean: ['github-update', 'report-update'],
string: ['build-url'],
default: {
'github-update': true,
'report-update': true,
'index-errors': true,
'build-url': process.env.BUILD_URL,
},
help: `
help: `
--no-github-update Execute the CLI without writing to Github
--no-report-update Execute the CLI without writing to the JUnit reports
--no-index-errors Execute the CLI without indexing failures into Elasticsearch
--build-url URL of the failed build, defaults to process.env.BUILD_URL
`,
},
}
);
},
}
);
}
Loading
Loading