From 65753be810d5f789b72f18db739792d8f6614072 Mon Sep 17 00:00:00 2001
From: Noah Martin
Date: Wed, 25 Jun 2025 09:24:20 -0400
Subject: [PATCH] Test workflow

---
 .github/workflows/flaky-test-detector.yml | 237 ++++++++++++++++++++++
 scripts/analyze-flaky-tests.py            | 210 +++++++++++++++++++
 2 files changed, 447 insertions(+)
 create mode 100644 .github/workflows/flaky-test-detector.yml
 create mode 100644 scripts/analyze-flaky-tests.py

diff --git a/.github/workflows/flaky-test-detector.yml b/.github/workflows/flaky-test-detector.yml
new file mode 100644
index 00000000000..96a507f055f
--- /dev/null
+++ b/.github/workflows/flaky-test-detector.yml
@@ -0,0 +1,237 @@
+name: Flaky Test Detector
+on:
+  pull_request:
+    branches: [ main, master, develop ]
+  workflow_dispatch:
+    inputs:
+      test_runs:
+        description: 'Number of test runs to perform'
+        required: true
+        default: '5'
+        type: string
+      branch:
+        description: 'Branch to test (defaults to current branch)'
+        required: false
+        type: string
+  schedule:
+    # Run weekly on Sundays at 2 AM UTC
+    - cron: '0 2 * * 0'
+
+# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build-test-server:
+    name: Build test server
+    runs-on: macos-15
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.event.pull_request.head.ref || github.ref }}
+
+      - name: Cache for Test Server
+        id: cache_test_server
+        uses: actions/cache@v4
+        with:
+          path: ./test-server/.build
+          key: test-server-${{ hashFiles('./test-server') }}
+          restore-keys: |
+            test-server-${{ hashFiles('./test-server') }}
+            test-server-
+
+      - name: Build Test Server
+        if: steps.cache_test_server.outputs.cache-hit != 'true'
+        working-directory: test-server
+        run: >-
+          swift build -c release 2>&1 | tee test-server-build.log
+
+      - name: Copy exec
+        working-directory: test-server
+        run: cp $(swift build --show-bin-path -c release)/Run test-server-exec
+
+      - name: Archiving DerivedData
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-server
+          path: |
+            ./test-server/test-server-exec
+
+  flaky-test-detector:
+    name: Flaky Test Detector - iOS 18.2 Xcode 16.2
+    runs-on: macos-15
+    timeout-minutes: 60
+    needs: build-test-server
+    strategy:
+      matrix:
+        # Default to 5 runs, but can be overridden via workflow_dispatch
+        run_number: [1, 2, 3, 4, 5]
+        # For manual runs, we'll use a different approach to handle custom run counts
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.event.pull_request.head.ref || github.ref }}
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: test-server
+
+      - name: Print hardware info
+        run: system_profiler SPHardwareDataType
+
+      - name: Allow test-server to run
+        run: chmod +x ./test-server-exec
+      - run: ./test-server-exec &
+
+      - name: Check test-server runs
+        run: curl http://localhost:8080/echo-baggage-header
+
+      - run: ./scripts/ci-select-xcode.sh 16.2
+
+      - name: Install Slather
+        run: gem install slather
+
+      # Build tests once for all runs
+      - name: Build tests
+        id: build_tests
+        run: |
+          ./scripts/sentry-xcodebuild.sh \
+            --platform iOS \
+            --os 18.2 \
+            --ref ${{ github.ref_name }} \
+            --command build-for-testing \
+            --device "iPhone 16" \
+            --configuration TestCI \
+            --scheme Sentry
+
+      - name: Run tests (Run ${{ matrix.run_number }})
+        id: run_tests
+        run: |
+          ./scripts/sentry-xcodebuild.sh \
+            --platform iOS \
+            --os 18.2 \
+            --ref ${{ github.ref_name }} \
+            --command test-without-building \
+            --device "iPhone 16" \
+            --configuration TestCI \
+            --scheme Sentry
+
+      - name: Archive test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-run-${{ matrix.run_number }}
+          path: |
+            build/reports/junit.xml
+            raw-test-output.log
+
+      - name: Archive logs on failure
+        uses: actions/upload-artifact@v4
+        if: ${{ failure() || cancelled() }}
+        with:
+          name: logs-run-${{ matrix.run_number }}
+          path: |
+            raw-build-output.log
+            raw-build-for-testing-output.log
+            raw-test-output.log
+
+  analyze-flaky-tests:
+    name: Analyze Flaky Tests
+    runs-on: macos-15
+    needs: [build-test-server, flaky-test-detector]
+    if: always()
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch || github.event.pull_request.head.ref || github.ref }}
+
+      - name: Download all test results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-results-run-*
+
+      - name: Run flaky test analysis
+        run: python3 scripts/analyze-flaky-tests.py --verbose
+
+      - name: Upload flaky test report
+        uses: actions/upload-artifact@v4
+        with:
+          name: flaky-test-analysis
+          path: |
+            flaky_tests_report.json
+            scripts/analyze-flaky-tests.py
+
+      - name: Comment on PR with results
+        if: github.event_name == 'workflow_dispatch' && github.event.inputs.branch || github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            let report;
+            try {
+              report = JSON.parse(fs.readFileSync('flaky_tests_report.json', 'utf8'));
+            } catch (e) {
+              console.log('Could not read flaky test report');
+              return;
+            }
+
+            const { flaky_tests, summary } = report;
+
+            let comment = `## Flaky Test Analysis Results\n\n`;
+            comment += `**Test Configuration:** iOS 18.2, Xcode 16.2, iPhone 16\n`;
+            comment += `**Total Runs:** ${report.total_runs}\n\n`;
+
+            if (flaky_tests.length === 0) {
+              comment += `🎉 **No flaky tests found!**\n\n`;
+              comment += `All tests passed consistently across ${report.total_runs} runs.`;
+            } else {
+              comment += `⚠️ **Found ${flaky_tests.length} flaky tests**\n\n`;
+              comment += `**Summary:**\n`;
+              comment += `- Highly flaky (≥50% failure rate): ${summary.highly_flaky_tests}\n`;
+              comment += `- Moderately flaky (20-49% failure rate): ${summary.moderately_flaky_tests}\n`;
+              comment += `- Slightly flaky (<20% failure rate): ${flaky_tests.length - summary.highly_flaky_tests - summary.moderately_flaky_tests}\n\n`;
+
+              comment += `**Top 10 Most Flaky Tests:**\n`;
+              flaky_tests.slice(0, 10).forEach((test, index) => {
+                comment += `${index + 1}. \`${test.test_name}\` - ${(test.flakiness_rate * 100).toFixed(1)}% failure rate (${test.failed}/${test.total_runs})\n`;
+              });
+
+              if (flaky_tests.length > 10) {
+                comment += `\n... and ${flaky_tests.length - 10} more flaky tests. See the full report in the artifacts.`;
+              }
+            }
+
+            // Try to comment on the PR
+            try {
+              let pullRequestNumber;
+
+              if (context.eventName === 'pull_request') {
+                // For pull_request events, use the PR number directly
+                pullRequestNumber = context.payload.pull_request.number;
+              } else if (context.eventName === 'workflow_dispatch' && context.payload.inputs.branch) {
+                // For workflow_dispatch events, find the PR by branch
+                const { data: pulls } = await github.rest.pulls.list({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  head: context.payload.inputs.branch,
+                  state: 'open'
+                });
+
+                if (pulls.length > 0) {
+                  pullRequestNumber = pulls[0].number;
+                }
+              }
+
+              if (pullRequestNumber) {
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: pullRequestNumber,
+                  body: comment
+                });
+              }
+            } catch (e) {
+              console.log('Could not comment on PR:', e.message);
+            }
diff --git a/scripts/analyze-flaky-tests.py b/scripts/analyze-flaky-tests.py
new file mode 100644
index 00000000000..58e75a7c861
--- /dev/null
+++ b/scripts/analyze-flaky-tests.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Flaky Test Analyzer
+
+This script analyzes JUnit XML test results from multiple test runs to identify
+flaky tests. A test is considered flaky if it both passes and fails across
+different runs.
+
+Usage:
+    python3 analyze-flaky-tests.py [--pattern GLOB] [--output-file PATH] [--verbose]
+"""
+
+import sys
+import xml.etree.ElementTree as ET
+import glob
+import json
+import argparse
+from collections import defaultdict
+from typing import Dict, List, Tuple, Any
+
+
+def parse_junit_xml(xml_file: str) -> Dict[str, str]:
+    """Parse JUnit XML and return test results."""
+    try:
+        tree = ET.parse(xml_file)
+        root = tree.getroot()
+
+        # Handle both JUnit 4 and JUnit 5 formats
+        testsuites = root.findall('.//testsuite') or [root]
+
+        results = {}
+        for testsuite in testsuites:
+            for testcase in testsuite.findall('.//testcase'):
+                class_name = testcase.get('classname', 'Unknown')
+                test_name = testcase.get('name', 'Unknown')
+                full_test_name = f"{class_name}.{test_name}"
+
+                # Check if test failed
+                failure = testcase.find('.//failure')
+                error = testcase.find('.//error')
+                skipped = testcase.find('.//skipped')
+
+                if skipped is not None:
+                    status = 'skipped'
+                elif failure is not None or error is not None:
+                    status = 'failed'
+                else:
+                    status = 'passed'
+
+                results[full_test_name] = status
+
+        return results
+    except Exception as e:
+        print(f"Error parsing {xml_file}: {e}", file=sys.stderr)
+        return {}
+
+
+def analyze_flaky_tests(
+    result_pattern: str = 'test-results-run-*/junit.xml'
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Analyze test results to find flaky tests."""
+    test_results = defaultdict(list)
+
+    # Find all test result files
+    result_files = glob.glob(result_pattern)
+    result_files.sort()
+
+    if not result_files:
+        print(f"No test result files found matching pattern: {result_pattern}",
+              file=sys.stderr)
+        return [], 0
+
+    print(f"Found {len(result_files)} test result files")
+
+    for i, xml_file in enumerate(result_files, 1):
+        print(f"Processing run {i}: {xml_file}")
+
+        results = parse_junit_xml(xml_file)
+        for test_name, status in results.items():
+            test_results[test_name].append(status)
+
+    # Analyze flaky tests
+    flaky_tests = []
+    total_runs = len(result_files)
+
+    for test_name, statuses in test_results.items():
+        if len(statuses) != total_runs:
+            print(f"Warning: Test {test_name} has {len(statuses)} results "
+                  f"for {total_runs} runs",
+                  file=sys.stderr)
+            continue
+
+        passed_count = statuses.count('passed')
+        failed_count = statuses.count('failed')
+        skipped_count = statuses.count('skipped')
+
+        # A test is flaky if it both passed and failed (not just skipped)
+        if passed_count > 0 and failed_count > 0:
+            flakiness_rate = failed_count / total_runs
+            flaky_tests.append({
+                'test_name': test_name,
+                'passed': passed_count,
+                'failed': failed_count,
+                'skipped': skipped_count,
+                'total_runs': total_runs,
+                'flakiness_rate': flakiness_rate,
+                'statuses': statuses
+            })
+
+    # Sort by flakiness rate (highest first)
+    flaky_tests.sort(key=lambda x: x['flakiness_rate'], reverse=True)
+
+    return flaky_tests, total_runs
+
+
+def generate_summary(flaky_tests: List[Dict[str, Any]]) -> Dict[str, int]:
+    """Generate summary statistics for flaky tests."""
+    if not flaky_tests:
+        return {
+            'total_flaky_tests': 0,
+            'highly_flaky_tests': 0,
+            'moderately_flaky_tests': 0,
+            'slightly_flaky_tests': 0
+        }
+
+    highly_flaky = len([t for t in flaky_tests if t['flakiness_rate'] >= 0.5])
+    moderately_flaky = len([t for t in flaky_tests
+                            if 0.2 <= t['flakiness_rate'] < 0.5])
+    slightly_flaky = len([t for t in flaky_tests if t['flakiness_rate'] < 0.2])
+
+    return {
+        'total_flaky_tests': len(flaky_tests),
+        'highly_flaky_tests': highly_flaky,
+        'moderately_flaky_tests': moderately_flaky,
+        'slightly_flaky_tests': slightly_flaky
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Analyze flaky tests from JUnit XML results'
+    )
+    parser.add_argument(
+        '--pattern',
+        default='test-results-run-*/junit.xml',
+        help='Glob pattern for test result files '
+             '(default: test-results-run-*/junit.xml)'
+    )
+    parser.add_argument(
+        '--output-file',
+        default='flaky_tests_report.json',
+        help='Output file for detailed report '
+             '(default: flaky_tests_report.json)'
+    )
+    parser.add_argument(
+        '--verbose', '-v',
+        action='store_true',
+        help='Verbose output'
+    )
+
+    args = parser.parse_args()
+
+    flaky_tests, total_runs = analyze_flaky_tests(args.pattern)
+    summary = generate_summary(flaky_tests)
+
+    print("\n=== Flaky Test Analysis ===")
+    print(f"Total test runs: {total_runs}")
+    print(f"Flaky tests found: {len(flaky_tests)}")
+    print()
+
+    if flaky_tests:
+        print("Flaky Tests (sorted by flakiness rate):")
+        print("-" * 80)
+        for test in flaky_tests:
+            print(f"Test: {test['test_name']}")
+            print(f"  Flakiness Rate: {test['flakiness_rate']:.1%} "
+                  f"({test['failed']}/{test['total_runs']} failed)")
+            if args.verbose:
+                print(f"  Results: {' '.join(test['statuses'])}")
+            print()
+
+        print("Summary:")
+        print(f"  Highly flaky (≥50% failure rate): {summary['highly_flaky_tests']}")
+        print(f"  Moderately flaky (20-49% failure rate): "
+              f"{summary['moderately_flaky_tests']}")
+        print(f"  Slightly flaky (<20% failure rate): "
+              f"{summary['slightly_flaky_tests']}")
+    else:
+        print("No flaky tests found! 🎉")
+
+    # Save detailed results to file
+    report = {
+        'total_runs': total_runs,
+        'flaky_tests': flaky_tests,
+        'summary': summary
+    }
+
+    with open(args.output_file, 'w') as f:
+        json.dump(report, f, indent=2)
+
+    print(f"\nDetailed report saved to {args.output_file}")
+
+    # Exit with error code if flaky tests found
+    if flaky_tests:
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()