Auto rerun transient CI failures #8091
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Analyzes failed CI PR runs for retry-safe transient failures and requests reruns for | |
| # the matched jobs through GitHub's job-rerun API, which also reruns dependent jobs. | |
| # For the supported behaviors and safety rails, see | |
| # docs/ci/auto-rerun-transient-ci-failures.md. | |
| name: Auto rerun transient CI failures | |
| on: | |
| workflow_run: | |
| workflows: ["CI"] | |
| types: | |
| - completed | |
| workflow_dispatch: | |
| inputs: | |
| run_id: | |
| description: 'CI workflow run ID to inspect' | |
| required: true | |
| type: number | |
| dry_run: | |
| description: 'Inspect and summarize without requesting reruns' | |
| required: false | |
| default: false | |
| type: boolean | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && inputs.run_id || github.event.workflow_run.id }} | |
| cancel-in-progress: false | |
| jobs: | |
| analyze-transient-failures: | |
| name: Analyze transient CI failures | |
| if: >- | |
| ${{ | |
| github.repository_owner == 'microsoft' && | |
| (github.event_name == 'workflow_dispatch' || | |
| (github.event.workflow_run.event == 'pull_request' && | |
| github.event.workflow_run.conclusion == 'failure' && | |
| github.event.workflow_run.run_attempt <= 3)) | |
| }} | |
| runs-on: ubuntu-latest | |
| permissions: | |
| actions: read | |
| checks: read | |
| contents: read | |
| outputs: | |
| source_run_id: ${{ steps.analyze.outputs.source_run_id }} | |
| source_run_attempt: ${{ steps.analyze.outputs.source_run_attempt }} | |
| source_run_url: ${{ steps.analyze.outputs.source_run_url }} | |
| retryable_jobs: ${{ steps.analyze.outputs.retryable_jobs }} | |
| pull_request_numbers: ${{ steps.analyze.outputs.pull_request_numbers }} | |
| retryable_count: ${{ steps.analyze.outputs.retryable_count }} | |
| skipped_count: ${{ steps.analyze.outputs.skipped_count }} | |
| rerun_eligible: ${{ steps.analyze.outputs.rerun_eligible }} | |
| rerun_execution_eligible: ${{ steps.analyze.outputs.rerun_execution_eligible }} | |
| dry_run: ${{ steps.analyze.outputs.dry_run }} | |
| max_retryable_jobs: ${{ steps.analyze.outputs.max_retryable_jobs }} | |
| test_pattern_matched_tests: ${{ steps.analyze.outputs.test_pattern_matched_tests }} | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| - name: Analyze failed jobs | |
| id: analyze | |
| uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 | |
| env: | |
| GITHUB_TOKEN: ${{ github.token }} | |
| MANUAL_RUN_ID: ${{ inputs.run_id }} | |
| MANUAL_DRY_RUN: ${{ inputs.dry_run }} | |
| with: | |
| script: | | |
| const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js'); | |
| const path = require('node:path'); | |
| const fs = require('node:fs'); | |
| const { execSync } = require('node:child_process'); | |
| const os = require('node:os'); | |
// Repository coordinates of the workflow that is currently executing.
const { owner, repo } = context.repo;
// True for manual (workflow_dispatch) invocations, false for workflow_run triggers.
const isWorkflowDispatch = context.eventName === 'workflow_dispatch';
// Retryable-job limit supplied by the shared helper module; forwarded to the analysis helpers.
const maxRetryableJobs = rerunWorkflow.defaultMaxRetryableJobs;
// Length of the job-log tail that is kept for inspection (256 * 1024).
const maxJobLogInspectionBytes = 256 * 1024;
/**
 * Collects the items of every page returned by a paginated GitHub REST route.
 *
 * Pages are requested 100 items at a time starting at page 1. Paging stops at
 * the first response whose Link header is missing or does not contain
 * rel="next".
 *
 * @param {string} route - Route template handed to github.request.
 * @param {object} parameters - Route parameters sent with every page request
 *   (per_page and page are always overridden).
 * @param {(data: any) => Iterable<any>} selectItems - Picks the items out of one response body.
 * @returns {Promise<any[]>} Items of all pages in the order they were received.
 */
async function paginate(route, parameters, selectItems) {
  const collected = [];
  let pageNumber = 1;
  let hasNextPage = true;
  while (hasNextPage) {
    const { data, headers } = await github.request(route, {
      ...parameters,
      per_page: 100,
      page: pageNumber,
    });
    collected.push(...selectItems(data));
    const linkHeader = headers.link;
    hasNextPage = Boolean(linkHeader) && linkHeader.includes('rel="next"');
    pageNumber += 1;
  }
  return collected;
}
/**
 * Resolves the workflow run that should be analyzed.
 *
 * For workflow_run triggers the run comes straight from the event payload.
 * For manual dispatches the MANUAL_RUN_ID environment variable is validated
 * as a positive integer and the run is fetched from the API.
 *
 * @returns {Promise<object>} The workflow run object.
 * @throws {Error} When a manual dispatch does not carry a positive integer run id.
 */
async function getWorkflowRun() {
  if (isWorkflowDispatch) {
    const manualRunId = Number(process.env.MANUAL_RUN_ID);
    const isValidRunId = Number.isInteger(manualRunId) && manualRunId > 0;
    if (!isValidRunId) {
      throw new Error('workflow_dispatch requires a valid run_id input.');
    }
    const { data } = await github.rest.actions.getWorkflowRun({
      owner,
      repo,
      run_id: manualRunId,
    });
    return data;
  }
  return context.payload.workflow_run;
}
/**
 * Reports whether this invocation is a manual dry run.
 *
 * Only manual dispatches can be dry runs; for them the MANUAL_DRY_RUN
 * environment variable must equal "true" (compared case-insensitively).
 *
 * @returns {boolean} True when reruns should only be summarized, not requested.
 */
function parseManualDryRun() {
  return isWorkflowDispatch
    && String(process.env.MANUAL_DRY_RUN).toLowerCase() === 'true';
}
/**
 * Lists all jobs that belong to one attempt of a workflow run.
 *
 * @param {number} runId - Id of the workflow run.
 * @param {number} attemptNumber - Attempt of that run whose jobs are wanted.
 * @returns {Promise<object[]>} Jobs gathered across all result pages.
 */
async function listJobsForAttempt(runId, attemptNumber) {
  const route = 'GET /repos/{owner}/{repo}/actions/runs/{run_id}/attempts/{attempt_number}/jobs';
  const routeParameters = {
    owner,
    repo,
    run_id: runId,
    attempt_number: attemptNumber,
  };
  // A page without a jobs property contributes nothing.
  const pickJobs = ({ jobs }) => jobs || [];
  return paginate(route, routeParameters, pickJobs);
}
/**
 * Fetches the check-run annotations attached to a job, best effort.
 *
 * The check run id is resolved through rerunWorkflow.getCheckRunIdForJob,
 * which is given a callback for looking up a job by id. Any failure, or an
 * unresolvable check run id, is reported as a warning and yields an empty list.
 *
 * @param {object} job - Job whose annotations are wanted; job.id is used in warnings.
 * @returns {Promise<object[]>} The annotations, or [] when none could be retrieved.
 */
async function listAnnotations(job) {
  // Lookup callback handed to the helper so it can fetch a job record on demand.
  const fetchJob = async jobId => {
    const { data } = await github.rest.actions.getJobForWorkflowRun({
      owner,
      repo,
      job_id: jobId,
    });
    return data;
  };

  try {
    const checkRunId = await rerunWorkflow.getCheckRunIdForJob({
      job,
      getJobForWorkflowRun: fetchJob,
    });
    if (checkRunId) {
      return await paginate(
        'GET /repos/{owner}/{repo}/check-runs/{check_run_id}/annotations',
        { owner, repo, check_run_id: checkRunId },
        // The response body itself is the annotation list; anything else counts as empty.
        data => (Array.isArray(data) ? data : []));
    }
    core.warning(`Unable to resolve a check run id for job ${job.id}.`);
    return [];
  } catch (error) {
    core.warning(`Failed to list annotations for job ${job.id}: ${error.message}`);
    return [];
  }
}
/**
 * Downloads the log of a job and returns its tail, best effort.
 *
 * The log is requested from the GitHub REST API with the token from the
 * GITHUB_TOKEN environment variable. Only the last maxJobLogInspectionBytes
 * units of the decoded text (String.prototype.slice) are returned. A non-OK
 * response or any other failure is reported as a warning and yields ''.
 *
 * @param {number} jobId - Id of the job whose log is wanted.
 * @returns {Promise<string>} Tail of the log text, or '' when it could not be fetched.
 */
async function getJobLogText(jobId) {
  try {
    const logsUrl = `https://api.github.com/repos/${owner}/${repo}/actions/jobs/${jobId}/logs`;
    const requestHeaders = {
      authorization: `Bearer ${process.env.GITHUB_TOKEN}`,
      accept: 'application/vnd.github+json',
      'x-github-api-version': '2022-11-28',
    };
    const response = await fetch(logsUrl, { headers: requestHeaders });
    if (response.ok) {
      const fullText = await response.text();
      return fullText.slice(-maxJobLogInspectionBytes);
    }
    throw new Error(`HTTP ${response.status}`);
  } catch (error) {
    core.warning(`Failed to fetch logs for job ${jobId}: ${error.message}`);
    return '';
  }
}
| const workflowRun = await getWorkflowRun(); | |
| const dryRun = parseManualDryRun(); | |
| const sourceRunUrl = workflowRun.html_url || `https://github.com/${owner}/${repo}/actions/runs/${workflowRun.id}`; | |
| core.setOutput('source_run_id', String(workflowRun.id)); | |
| core.setOutput('source_run_attempt', String(workflowRun.run_attempt || '')); | |
| core.setOutput('source_run_url', sourceRunUrl); | |
| core.setOutput('dry_run', String(dryRun)); | |
| core.setOutput('max_retryable_jobs', String(maxRetryableJobs)); | |
| core.setOutput('retryable_jobs', '[]'); | |
| core.setOutput('pull_request_numbers', '[]'); | |
| core.setOutput('retryable_count', '0'); | |
| core.setOutput('skipped_count', '0'); | |
| core.setOutput('rerun_eligible', 'false'); | |
| core.setOutput('rerun_execution_eligible', 'false'); | |
| core.setOutput('test_pattern_matched_tests', '[]'); | |
| if (workflowRun.name && workflowRun.name !== 'CI') { | |
| console.log(`Workflow run ${workflowRun.id} is '${workflowRun.name}', not 'CI'. Skipping.`); | |
| return; | |
| } | |
| const pullRequestNumbers = await rerunWorkflow.getAssociatedPullRequestNumbers({ | |
| github, | |
| owner, | |
| repo, | |
| workflowRun, | |
| warn: message => core.warning(message), | |
| }); | |
| core.setOutput('pull_request_numbers', JSON.stringify(pullRequestNumbers)); | |
| if (pullRequestNumbers.length === 0) { | |
| console.log('No associated pull request could be resolved for this workflow run. Skipping.'); | |
| return; | |
| } | |
| const runId = workflowRun.id; | |
| const runAttempt = workflowRun.run_attempt; | |
| const jobs = await listJobsForAttempt(runId, runAttempt); | |
| // Load test retry patterns config | |
| const configPath = path.join(process.env.GITHUB_WORKSPACE, 'eng', 'test-retry-patterns.json'); | |
| const { config: retryPatternsConfig, errors: configErrors } = rerunWorkflow.loadRetryPatternsConfig(configPath); | |
| if (configErrors.length > 0) { | |
| core.warning(`Test retry patterns config has errors: ${configErrors.join('; ')}`); | |
| } | |
| let { failedJobs, retryableJobs, skippedJobs } = await rerunWorkflow.analyzeFailedJobs({ | |
| jobs, | |
| getAnnotationsForJob: async job => listAnnotations(job), | |
| getJobLogTextForJob: async job => getJobLogText(job.id), | |
| maxRetryableJobs, | |
| retryPatternsConfig, | |
| }); | |
// TRX-based analysis: check test output for transient patterns.
// Runs only when at least one skipped job failed in a test-execution step and the
// config provides test failure patterns. Every failure in here is non-fatal and is
// reported as a warning.
let testPatternMatchedTests = [];
const hasSkippedTestExecJobs = skippedJobs.some(job =>
  rerunWorkflow.hasTestExecutionFailureStep(job.failedSteps)
);
const testFailurePatterns = retryPatternsConfig?.testFailurePatterns;
if (hasSkippedTestExecJobs && Array.isArray(testFailurePatterns) && testFailurePatterns.length > 0) {
  try {
    const artifacts = await paginate(
      'GET /repos/{owner}/{repo}/actions/runs/{run_id}/artifacts',
      { owner, repo, run_id: runId },
      data => data.artifacts || []);
    const testArtifact = rerunWorkflow.selectTestResultsArtifact(artifacts);
    if (testArtifact) {
      console.log(`Downloading test results artifact '${testArtifact.name}' (${testArtifact.size_in_bytes} bytes)...`);
      const download = await github.rest.actions.downloadArtifact({
        owner,
        repo,
        artifact_id: testArtifact.id,
        archive_format: 'zip',
      });
      const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-results-'));
      try {
        const zipPath = path.join(tmpDir, 'test-results.zip');
        fs.writeFileSync(zipPath, Buffer.from(download.data));
        const trxDir = path.join(tmpDir, 'trx');
        fs.mkdirSync(trxDir, { recursive: true });
        // Run unzip with an argument vector instead of a shell command line, so the
        // temp paths are passed verbatim and never need shell quoting.
        const { execFileSync } = require('node:child_process');
        execFileSync('unzip', ['-qo', zipPath, '-d', trxDir], { timeout: 30_000 });
        const trxFileContents = [];
        const maxTrxFiles = 200;
        const maxTrxFileBytes = 50 * 1024 * 1024; // 50MB per file cap
        const resolvedTrxDir = fs.realpathSync(trxDir);
        // Depth-first walk that collects .trx files. Symlinks are skipped and every
        // entry must resolve to a path inside the extraction directory.
        const findTrxFiles = (dir) => {
          for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
            // Once the cap is reached no further file can be added, so stop walking.
            if (trxFileContents.length >= maxTrxFiles) {
              return;
            }
            if (entry.isSymbolicLink()) {
              continue;
            }
            const fullPath = path.join(dir, entry.name);
            const resolvedPath = fs.realpathSync(fullPath);
            if (!resolvedPath.startsWith(resolvedTrxDir + path.sep) && resolvedPath !== resolvedTrxDir) {
              continue;
            }
            if (entry.isDirectory()) {
              findTrxFiles(fullPath);
            } else if (entry.name.endsWith('.trx')) {
              const stat = fs.statSync(fullPath);
              // Oversized files are skipped rather than read into memory.
              if (stat.size <= maxTrxFileBytes) {
                trxFileContents.push({
                  fileName: entry.name,
                  content: fs.readFileSync(fullPath, 'utf8'),
                });
              }
            }
          }
        };
        findTrxFiles(trxDir);
        if (trxFileContents.length > 0) {
          const { allMatchedTests } = rerunWorkflow.analyzeTrxFiles(trxFileContents, testFailurePatterns);
          if (allMatchedTests.length > 0) {
            console.log(`Found ${allMatchedTests.length} test(s) matching transient failure patterns.`);
            // Matching tests let the helper move jobs between the skipped and retryable lists.
            const promoted = rerunWorkflow.promoteTestExecutionFailureJobs(retryableJobs, skippedJobs, allMatchedTests);
            retryableJobs = promoted.retryableJobs;
            skippedJobs = promoted.skippedJobs;
            testPatternMatchedTests = allMatchedTests;
          }
        }
      } finally {
        // Always remove the downloaded archive and the extracted files.
        fs.rmSync(tmpDir, { recursive: true, force: true });
      }
    }
  } catch (trxError) {
    core.warning(`TRX analysis failed (non-fatal): ${trxError.message}`);
  }
}
| core.setOutput('retryable_jobs', JSON.stringify(retryableJobs.map(job => ({ | |
| id: job.id, | |
| name: job.name, | |
| htmlUrl: job.htmlUrl, | |
| reason: job.reason, | |
| })))); | |
| core.setOutput('retryable_count', String(retryableJobs.length)); | |
| core.setOutput('skipped_count', String(skippedJobs.length)); | |
| const rerunEligible = rerunWorkflow.computeRerunEligibility({ | |
| retryableCount: retryableJobs.length, | |
| maxRetryableJobs, | |
| runAttempt, | |
| }); | |
| const rerunExecutionEligible = rerunWorkflow.computeRerunExecutionEligibility({ | |
| dryRun, | |
| retryableCount: retryableJobs.length, | |
| maxRetryableJobs, | |
| runAttempt, | |
| }); | |
| core.setOutput('rerun_eligible', String(rerunEligible)); | |
| core.setOutput('rerun_execution_eligible', String(rerunExecutionEligible)); | |
| core.setOutput('test_pattern_matched_tests', JSON.stringify(testPatternMatchedTests.slice(0, 50).map(t => ({ | |
| testName: t.testName, | |
| reason: t.reason, | |
| })))); | |
| await rerunWorkflow.writeAnalysisSummary({ | |
| summary: core.summary, | |
| failedJobs, | |
| retryableJobs, | |
| skippedJobs, | |
| maxRetryableJobs, | |
| dryRun, | |
| rerunEligible, | |
| sourceRunUrl, | |
| sourceRunAttempt: runAttempt, | |
| testPatternMatchedTests, | |
| }); | |
| if (retryableJobs.length === 0) { | |
| console.log('No retryable failed jobs were detected.'); | |
| return; | |
| } | |
| rerun-transient-failures: | |
| name: Rerun transient CI failures | |
| needs: [analyze-transient-failures] | |
| if: ${{ needs.analyze-transient-failures.outputs.rerun_execution_eligible == 'true' }} | |
| runs-on: ubuntu-latest | |
| permissions: | |
| actions: write | |
| contents: read | |
| issues: write | |
| pull-requests: write | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| - name: Rerun matched jobs | |
| uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 | |
| env: | |
| RETRYABLE_JOBS: ${{ needs.analyze-transient-failures.outputs.retryable_jobs }} | |
| PULL_REQUEST_NUMBERS: ${{ needs.analyze-transient-failures.outputs.pull_request_numbers }} | |
| SOURCE_RUN_ID: ${{ needs.analyze-transient-failures.outputs.source_run_id }} | |
| SOURCE_RUN_ATTEMPT: ${{ needs.analyze-transient-failures.outputs.source_run_attempt }} | |
| SOURCE_RUN_URL: ${{ needs.analyze-transient-failures.outputs.source_run_url }} | |
| TEST_PATTERN_MATCHED_TESTS: ${{ needs.analyze-transient-failures.outputs.test_pattern_matched_tests }} | |
| with: | |
| script: | | |
| const rerunWorkflow = require('./.github/workflows/auto-rerun-transient-ci-failures.js'); | |
| const owner = context.repo.owner; | |
| const repo = context.repo.repo; | |
| const retryableJobs = JSON.parse(process.env.RETRYABLE_JOBS || '[]'); | |
| const pullRequestNumbers = JSON.parse(process.env.PULL_REQUEST_NUMBERS || '[]'); | |
| const sourceRunId = Number(process.env.SOURCE_RUN_ID); | |
| const sourceRunAttempt = Number(process.env.SOURCE_RUN_ATTEMPT); | |
| const sourceRunUrl = process.env.SOURCE_RUN_URL; | |
| const testPatternMatchedTests = JSON.parse(process.env.TEST_PATTERN_MATCHED_TESTS || '[]'); | |
| if (retryableJobs.length === 0) { | |
| console.log('No retryable jobs were provided to the rerun job.'); | |
| return; | |
| } | |
| await rerunWorkflow.rerunMatchedJobs({ | |
| github, | |
| owner, | |
| repo, | |
| retryableJobs, | |
| pullRequestNumbers, | |
| summary: core.summary, | |
| sourceRunId, | |
| sourceRunAttempt, | |
| sourceRunUrl, | |
| testPatternMatchedTests, | |
| }); |