From 65753be810d5f789b72f18db739792d8f6614072 Mon Sep 17 00:00:00 2001
From: Noah Martin
Date: Wed, 25 Jun 2025 09:24:20 -0400
Subject: [PATCH] Test workflow

---
 .github/workflows/flaky-test-detector.yml | 237 ++++++++++++++++++++++
 scripts/analyze-flaky-tests.py            | 210 +++++++++++++++++++
 2 files changed, 447 insertions(+)
 create mode 100644 .github/workflows/flaky-test-detector.yml
 create mode 100644 scripts/analyze-flaky-tests.py

diff --git a/.github/workflows/flaky-test-detector.yml b/.github/workflows/flaky-test-detector.yml
new file mode 100644
index 00000000000..96a507f055f
--- /dev/null
+++ b/.github/workflows/flaky-test-detector.yml
@@ -0,0 +1,237 @@
+name: Flaky Test Detector
+on:
+  pull_request:
+    branches: [ main, master, develop ]
+  workflow_dispatch:
+    inputs:
+      test_runs:
+        description: 'Number of test runs to perform'
+        required: true
+        default: '5'
+        type: string
+      branch:
+        description: 'Branch to test (defaults to current branch)'
+        required: false
+        type: string
+  schedule:
+    # Run weekly on Sundays at 2 AM UTC
+    - cron: '0 2 * * 0'
+
+# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build-test-server:
+    name: Build test server
+    runs-on: macos-15
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.event.pull_request.head.ref || github.ref }}
+
+      - name: Cache for Test Server
+        id: cache_test_server
+        uses: actions/cache@v4
+        with:
+          path: ./test-server/.build
+          key: test-server-${{ hashFiles('./test-server') }}
+          restore-keys: |
+            test-server-${{ hashFiles('./test-server') }}
+            test-server-
+
+      - name: Build Test Server
+        if: steps.cache_test_server.outputs.cache-hit != 'true'
+        working-directory: test-server
+        run: >-
+          swift build -c release 2>&1 | tee test-server-build.log
+
+      - name: Copy exec
+        working-directory: test-server
+        run: cp $(swift build --show-bin-path -c release)/Run test-server-exec
+
+      - name: Archiving DerivedData
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-server
+          path: |
+            ./test-server/test-server-exec
+
+  flaky-test-detector:
+    name: Flaky Test Detector - iOS 18.2 Xcode 16.2
+    runs-on: macos-15
+    timeout-minutes: 60
+    needs: build-test-server
+    strategy:
+      matrix:
+        # Default to 5 runs, but can be overridden via workflow_dispatch
+        run_number: [1, 2, 3, 4, 5]
+        # For manual runs, we'll use a different approach to handle custom run counts
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.event.pull_request.head.ref || github.ref }}
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: test-server
+
+      - name: Print hardware info
+        run: system_profiler SPHardwareDataType
+
+      - name: Allow test-server to run
+        run: chmod +x ./test-server-exec
+      - run: ./test-server-exec &
+
+      - name: Check test-server runs
+        run: curl http://localhost:8080/echo-baggage-header
+
+      - run: ./scripts/ci-select-xcode.sh 16.2
+
+      - name: Install Slather
+        run: gem install slather
+
+      # Build tests once for all runs
+      - name: Build tests
+        id: build_tests
+        run: |
+          ./scripts/sentry-xcodebuild.sh \
+            --platform iOS \
+            --os 18.2 \
+            --ref ${{ github.ref_name }} \
+            --command build-for-testing \
+            --device "iPhone 16" \
+            --configuration TestCI \
+            --scheme Sentry
+
+      - name: Run tests (Run ${{ matrix.run_number }})
+        id: run_tests
+        run: |
+          ./scripts/sentry-xcodebuild.sh \
+            --platform iOS \
+            --os 18.2 \
+            --ref ${{ github.ref_name }} \
+            --command test-without-building \
+            --device "iPhone 16" \
+            --configuration TestCI \
+            --scheme Sentry
+
+      - name: Archive test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-run-${{ matrix.run_number }}
+          path: |
+            build/reports/junit.xml
+            raw-test-output.log
+
+      - name: Archive logs on failure
+        uses: actions/upload-artifact@v4
+        if: ${{ failure() || cancelled() }}
+        with:
+          name: logs-run-${{ matrix.run_number }}
+          path: |
+            raw-build-output.log
+            raw-build-for-testing-output.log
+            raw-test-output.log
+
+  analyze-flaky-tests:
+    name: Analyze Flaky Tests
+    runs-on: macos-15
+    needs: [build-test-server, flaky-test-detector]
+    if: always()
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.event.pull_request.head.ref || github.ref }}
+
+      - name: Download all test results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-results-run-*
+
+      - name: Run flaky test analysis
+        run: python3 scripts/analyze-flaky-tests.py --verbose
+
+      - name: Upload flaky test report
+        uses: actions/upload-artifact@v4
+        with:
+          name: flaky-test-analysis
+          path: |
+            flaky_tests_report.json
+            scripts/analyze-flaky-tests.py
+
+      - name: Comment on PR with results
+        if: github.event_name == 'workflow_dispatch' && github.event.inputs.branch || github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            let report;
+            try {
+              report = JSON.parse(fs.readFileSync('flaky_tests_report.json', 'utf8'));
+            } catch (e) {
+              console.log('Could not read flaky test report');
+              return;
+            }
+
+            const { flaky_tests, summary } = report;
+
+            let comment = `## Flaky Test Analysis Results\n\n`;
+            comment += `**Test Configuration:** iOS 18.2, Xcode 16.2, iPhone 16\n`;
+            comment += `**Total Runs:** ${report.total_runs}\n\n`;
+
+            if (flaky_tests.length === 0) {
+              comment += `🎉 **No flaky tests found!**\n\n`;
+              comment += `All tests passed consistently across ${report.total_runs} runs.`;
+            } else {
+              comment += `⚠️ **Found ${flaky_tests.length} flaky tests**\n\n`;
+              comment += `**Summary:**\n`;
+              comment += `- Highly flaky (≥50% failure rate): ${summary.highly_flaky_tests}\n`;
+              comment += `- Moderately flaky (20-49% failure rate): ${summary.moderately_flaky_tests}\n`;
+              comment += `- Slightly flaky (<20% failure rate): ${flaky_tests.length - summary.highly_flaky_tests - summary.moderately_flaky_tests}\n\n`;
+
+              comment += `**Top 10 Most Flaky Tests:**\n`;
+              flaky_tests.slice(0, 10).forEach((test, index) => {
+                comment += `${index + 1}. \`${test.test_name}\` - ${(test.flakiness_rate * 100).toFixed(1)}% failure rate (${test.failed}/${test.total_runs})\n`;
+              });
+
+              if (flaky_tests.length > 10) {
+                comment += `\n... and ${flaky_tests.length - 10} more flaky tests. See the full report in the artifacts.`;
+              }
+            }
+
+            // Try to comment on the PR
+            try {
+              let pullRequestNumber;
+
+              if (context.eventName === 'pull_request') {
+                // For pull_request events, use the PR number directly
+                pullRequestNumber = context.payload.pull_request.number;
+              } else if (context.eventName === 'workflow_dispatch' && context.payload.inputs.branch) {
+                // For workflow_dispatch events, find the PR by branch
+                const { data: pulls } = await github.rest.pulls.list({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  head: context.payload.inputs.branch,
+                  state: 'open'
+                });
+
+                if (pulls.length > 0) {
+                  pullRequestNumber = pulls[0].number;
+                }
+              }
+
+              if (pullRequestNumber) {
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: pullRequestNumber,
+                  body: comment
+                });
+              }
+            } catch (e) {
+              console.log('Could not comment on PR:', e.message);
+            }
diff --git a/scripts/analyze-flaky-tests.py b/scripts/analyze-flaky-tests.py
new file mode 100644
index 00000000000..58e75a7c861
--- /dev/null
+++ b/scripts/analyze-flaky-tests.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Flaky Test Analyzer
+
+This script analyzes JUnit XML test results from multiple test runs to identify
+flaky tests. A test is considered flaky if it both passes and fails across
+different runs.
+
+Usage:
+    python3 analyze-flaky-tests.py [--pattern GLOB] [--output-file PATH] [--verbose]
+"""
+
+import sys
+import xml.etree.ElementTree as ET
+import glob
+import json
+import argparse
+from collections import defaultdict
+from typing import Dict, List, Tuple, Any
+
+
+def parse_junit_xml(xml_file: str) -> Dict[str, str]:
+    """Parse JUnit XML and return test results."""
+    try:
+        tree = ET.parse(xml_file)
+        root = tree.getroot()
+
+        # Handle both JUnit 4 and JUnit 5 formats
+        testsuites = root.findall('.//testsuite') or [root]
+
+        results = {}
+        for testsuite in testsuites:
+            for testcase in testsuite.findall('.//testcase'):
+                class_name = testcase.get('classname', 'Unknown')
+                test_name = testcase.get('name', 'Unknown')
+                full_test_name = f"{class_name}.{test_name}"
+
+                # Check if test failed
+                failure = testcase.find('.//failure')
+                error = testcase.find('.//error')
+                skipped = testcase.find('.//skipped')
+
+                if skipped is not None:
+                    status = 'skipped'
+                elif failure is not None or error is not None:
+                    status = 'failed'
+                else:
+                    status = 'passed'
+
+                results[full_test_name] = status
+
+        return results
+    except Exception as e:
+        print(f"Error parsing {xml_file}: {e}", file=sys.stderr)
+        return {}
+
+
+def analyze_flaky_tests(
+    result_pattern: str = 'test-results-run-*/junit.xml'
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Analyze test results to find flaky tests."""
+    test_results = defaultdict(list)
+
+    # Find all test result files
+    result_files = glob.glob(result_pattern)
+    result_files.sort()
+
+    if not result_files:
+        print(f"No test result files found matching pattern: {result_pattern}",
+              file=sys.stderr)
+        return [], 0
+
+    print(f"Found {len(result_files)} test result files")
+
+    for i, xml_file in enumerate(result_files, 1):
+        print(f"Processing run {i}: {xml_file}")
+
+        results = parse_junit_xml(xml_file)
+        for test_name, status in results.items():
+            test_results[test_name].append(status)
+
+    # Analyze flaky tests
+    flaky_tests = []
+    total_runs = len(result_files)
+
+    for test_name, statuses in test_results.items():
+        if len(statuses) != total_runs:
+            print(f"Warning: Test {test_name} has {len(statuses)} results "
+                  f"for {total_runs} runs",
+                  file=sys.stderr)
+            continue
+
+        passed_count = statuses.count('passed')
+        failed_count = statuses.count('failed')
+        skipped_count = statuses.count('skipped')
+
+        # A test is flaky if it both passed and failed (not just skipped)
+        if passed_count > 0 and failed_count > 0:
+            flakiness_rate = failed_count / total_runs
+            flaky_tests.append({
+                'test_name': test_name,
+                'passed': passed_count,
+                'failed': failed_count,
+                'skipped': skipped_count,
+                'total_runs': total_runs,
+                'flakiness_rate': flakiness_rate,
+                'statuses': statuses
+            })
+
+    # Sort by flakiness rate (highest first)
+    flaky_tests.sort(key=lambda x: x['flakiness_rate'], reverse=True)
+
+    return flaky_tests, total_runs
+
+
+def generate_summary(flaky_tests: List[Dict[str, Any]]) -> Dict[str, int]:
+    """Generate summary statistics for flaky tests."""
+    if not flaky_tests:
+        return {
+            'total_flaky_tests': 0,
+            'highly_flaky_tests': 0,
+            'moderately_flaky_tests': 0,
+            'slightly_flaky_tests': 0
+        }
+
+    highly_flaky = len([t for t in flaky_tests if t['flakiness_rate'] >= 0.5])
+    moderately_flaky = len([t for t in flaky_tests
+                            if 0.2 <= t['flakiness_rate'] < 0.5])
+    slightly_flaky = len([t for t in flaky_tests if t['flakiness_rate'] < 0.2])
+
+    return {
+        'total_flaky_tests': len(flaky_tests),
+        'highly_flaky_tests': highly_flaky,
+        'moderately_flaky_tests': moderately_flaky,
+        'slightly_flaky_tests': slightly_flaky
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Analyze flaky tests from JUnit XML results'
+    )
+    parser.add_argument(
+        '--pattern',
+        default='test-results-run-*/junit.xml',
+        help='Glob pattern for test result files '
+             '(default: test-results-run-*/junit.xml)'
+    )
+    parser.add_argument(
+        '--output-file',
+        default='flaky_tests_report.json',
+        help='Output file for detailed report '
+             '(default: flaky_tests_report.json)'
+    )
+    parser.add_argument(
+        '--verbose', '-v',
+        action='store_true',
+        help='Verbose output'
+    )
+
+    args = parser.parse_args()
+
+    flaky_tests, total_runs = analyze_flaky_tests(args.pattern)
+    summary = generate_summary(flaky_tests)
+
+    print("\n=== Flaky Test Analysis ===")
+    print(f"Total test runs: {total_runs}")
+    print(f"Flaky tests found: {len(flaky_tests)}")
+    print()
+
+    if flaky_tests:
+        print("Flaky Tests (sorted by flakiness rate):")
+        print("-" * 80)
+        for test in flaky_tests:
+            print(f"Test: {test['test_name']}")
+            print(f"  Flakiness Rate: {test['flakiness_rate']:.1%} "
+                  f"({test['failed']}/{test['total_runs']} failed)")
+            if args.verbose:
+                print(f"  Results: {' '.join(test['statuses'])}")
+            print()
+
+        print("Summary:")
+        print(f"  Highly flaky (≥50% failure rate): {summary['highly_flaky_tests']}")
+        print(f"  Moderately flaky (20-49% failure rate): "
+              f"{summary['moderately_flaky_tests']}")
+        print(f"  Slightly flaky (<20% failure rate): "
+              f"{summary['slightly_flaky_tests']}")
+    else:
+        print("No flaky tests found! 🎉")
+
+    # Save detailed results to file
+    report = {
+        'total_runs': total_runs,
+        'flaky_tests': flaky_tests,
+        'summary': summary
+    }
+
+    with open(args.output_file, 'w') as f:
+        json.dump(report, f, indent=2)
+
+    print(f"\nDetailed report saved to {args.output_file}")
+
+    # Exit with error code if flaky tests found
+    if flaky_tests:
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()