feat: richer flake scoring (flips/instability/streaks); add vitest/go/dotnet/shell runners

Cicatriiz · Cicatriiz · commit 833c21cea7c4 · 2025-08-07T17:19:57.000-04:00
diff --git a/flakewall/cli.py b/flakewall/cli.py
@@ -9,7 +9,13 @@
 import typer
 
 from .config import FlakewallConfig, ensure_default_files, CONFIG_PATH
-from .junit import parse_junit_files, failing_ids, compute_flake_stats
+from .junit import (
+    parse_junit_files,
+    parse_junit_files_grouped,
+    failing_ids,
+    compute_flake_stats,
+    compute_flake_metrics,
+)
 from .quarantine import load_quarantined, add_to_quarantine
 from .runner import retry_tests
 from . import __version__
@@ -116,51 +122,105 @@ def score(
     gh_summary: bool = typer.Option(
         False, help="Write summary of flaky candidates to GITHUB_STEP_SUMMARY"
     ),
+    rich: bool = typer.Option(
+        False, help="Compute richer metrics (flips, instability, streaks) across runs"
+    ),
 ) -> None:
     """Compute minimal flake stats from a set of JUnit XML files and print tests that flipped."""
     cfg = FlakewallConfig.load()
     pattern = junit or cfg.report_glob
     files = [Path(p) for p in glob.glob(pattern, recursive=True)]
     results = parse_junit_files(files)
-    stats = compute_flake_stats(results)
-
-    flippers = [s for s in stats.values() if s.total_runs >= min_total and s.has_flip]
-    flippers.sort(key=lambda s: (-(s.fail_ratio), s.test_id))
+    if rich:
+        grouped = parse_junit_files_grouped(files)
+        metrics = compute_flake_metrics(grouped)
+        flippers = [
+            m
+            for m in metrics.values()
+            if m.total_runs >= min_total and (m.fail_error_count > 0 and m.pass_count > 0)
+        ]
+        flippers.sort(key=lambda m: (-m.instability_index, -m.flips, m.test_id))
+    else:
+        stats = compute_flake_stats(results)
+        flippers = [s for s in stats.values() if s.total_runs >= min_total and s.has_flip]
+        flippers.sort(key=lambda s: (-(s.fail_ratio), s.test_id))
 
     if json_out:
-        payload = {
-            "files_count": len(files),
-            "cases_count": len(results),
-            "flaky_candidates_count": len(flippers),
-            "flaky_candidates": [
-                {
-                    "test_id": s.test_id,
-                    "total_runs": s.total_runs,
-                    "pass_count": s.pass_count,
-                    "fail_error_count": s.fail_error_count,
-                    "skipped_count": s.skipped_count,
-                    "has_flip": s.has_flip,
-                    "fail_ratio": s.fail_ratio,
-                }
-                for s in flippers
-            ],
-        }
+        if rich:
+            payload = {
+                "files_count": len(files),
+                "flaky_candidates_count": len(flippers),
+                "flaky_candidates": [
+                    {
+                        "test_id": m.test_id,
+                        "total_runs": m.total_runs,
+                        "pass_count": m.pass_count,
+                        "fail_error_count": m.fail_error_count,
+                        "skipped_count": m.skipped_count,
+                        "flips": m.flips,
+                        "instability_index": m.instability_index,
+                        "longest_pass_streak": m.longest_pass_streak,
+                        "longest_failerr_streak": m.longest_failerr_streak,
+                    }
+                    for m in flippers
+                ],
+            }
+        else:
+            payload = {
+                "files_count": len(files),
+                "cases_count": len(results),
+                "flaky_candidates_count": len(flippers),
+                "flaky_candidates": [
+                    {
+                        "test_id": s.test_id,
+                        "total_runs": s.total_runs,
+                        "pass_count": s.pass_count,
+                        "fail_error_count": s.fail_error_count,
+                        "skipped_count": s.skipped_count,
+                        "has_flip": s.has_flip,
+                        "fail_ratio": s.fail_ratio,
+                    }
+                    for s in flippers
+                ],
+            }
         typer.echo(json.dumps(payload, indent=2))
         return
     else:
-        header = f"Files: {len(files)} | Cases: {len(results)} | Flaky candidates: {len(flippers)}"
-        typer.echo(header)
-        for s in flippers:
-            line = (
-                f" - {s.test_id}: runs={s.total_runs} pass={s.pass_count} "
-                f"fail+error={s.fail_error_count} skipped={s.skipped_count} "
-                f"fail_ratio={s.fail_ratio:.2f}"
+        if rich:
+            header = f"Files: {len(files)} | Flaky candidates: {len(flippers)}"
+        else:
+            header = (
+                f"Files: {len(files)} | Cases: {len(results)} | Flaky candidates: {len(flippers)}"
             )
-            typer.echo(line)
+        typer.echo(header)
+        if rich:
+            for m in flippers:
+                line = (
+                    f" - {m.test_id}: runs={m.total_runs} flips={m.flips} "
+                    f"instability={m.instability_index:.2f} pass_streak={m.longest_pass_streak} "
+                    f"fail_streak={m.longest_failerr_streak}"
+                )
+                typer.echo(line)
+        else:
+            for s in flippers:
+                line = (
+                    f" - {s.test_id}: runs={s.total_runs} pass={s.pass_count} "
+                    f"fail+error={s.fail_error_count} skipped={s.skipped_count} "
+                    f"fail_ratio={s.fail_ratio:.2f}"
+                )
+                typer.echo(line)
         if gh_summary:
             lines = ["### flakewall score", header]
-            for s in flippers:
-                lines.append(f"- {s.test_id} (runs={s.total_runs}, fail_ratio={s.fail_ratio:.2f})")
+            if rich:
+                for m in flippers:
+                    lines.append(
+                        f"- {m.test_id} (runs={m.total_runs}, flips={m.flips}, instability={m.instability_index:.2f})"
+                    )
+            else:
+                for s in flippers:
+                    lines.append(
+                        f"- {s.test_id} (runs={s.total_runs}, fail_ratio={s.fail_ratio:.2f})"
+                    )
             write_step_summary(lines)
 
 
diff --git a/flakewall/junit.py b/flakewall/junit.py
@@ -2,7 +2,7 @@
 
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Iterable, List, Dict
+from typing import Iterable, List, Dict, Tuple
 import xml.etree.ElementTree as ET
 
 
@@ -38,6 +38,17 @@ def parse_junit_files(paths: Iterable[Path]) -> List[CaseResult]:
     return results
 
 
+def parse_junit_file(path: Path) -> List[CaseResult]:
+    return parse_junit_files([path])  # single-report wrapper: delegates with a one-element list
+
+
+def parse_junit_files_grouped(paths: Iterable[Path]) -> Dict[Path, List[CaseResult]]:
+    """Parse every report separately and key each resulting case list by its path."""
+    # Entries follow the iteration order of ``paths``; a repeated path is parsed
+    # again and its later result replaces the earlier one under the same key.
+    return {report: parse_junit_file(report) for report in paths}
+
+
 def failing_ids(results: Iterable[CaseResult]) -> List[str]:
     return [r.test_id for r in results if r.status in {"fail", "error"}]
 
@@ -90,3 +101,93 @@ def compute_flake_stats(results: Iterable[CaseResult]) -> Dict[str, FlakeStats]:
             skipped_count=c["skipped"],
         )
     return stats
+
+
+@dataclass(frozen=True)
+class FlakeMetrics:  # immutable per-test metrics record built by compute_flake_metrics
+    test_id: str
+    total_runs: int  # every recorded result for the test, whatever its status
+    pass_count: int
+    fail_error_count: int  # "fail" and "error" results combined
+    skipped_count: int
+    flips: int  # count of pass <-> fail/error transitions in run order
+    instability_index: float  # flips normalized by max possible flips
+    longest_pass_streak: int  # longest run of consecutive passes
+    longest_failerr_streak: int  # longest run of consecutive fail/error results
+
+
+def compute_flake_metrics(
+    grouped_results: Dict[Path, List[CaseResult]],
+    order: str = "name",  # name|mtime
+) -> Dict[str, FlakeMetrics]:
+    """Compute per-test flip, instability and streak metrics across report files.
+
+    Each key of ``grouped_results`` is one run. Runs are ordered by ``str(path)``,
+    or by modification time when ``order`` is ``"mtime"`` (this stats every path).
+
+    Skipped or unrecognised statuses are neutral for flips: a result is compared
+    with the latest pass or fail/error, so ``pass, skipped, fail`` is one flip.
+    They still reset both streaks and count towards ``total_runs``.
+
+    Returns test id -> ``FlakeMetrics``; ``instability_index`` is flips divided by
+    (pass + fail/error results - 1), or ``0.0`` with fewer than two such results.
+    """
+
+    def status_key(status: str) -> int:
+        # 0 = pass, 1 = fail/error, 2 = skipped or unrecognised
+        return 1 if status in {"fail", "error"} else 0 if status == "pass" else 2
+
+    # Order files deterministically
+    files = list(grouped_results.keys())
+    if order == "mtime":
+        files.sort(key=lambda p: p.stat().st_mtime)
+    else:
+        files.sort(key=lambda p: str(p))
+
+    # Build per-test status sequences across files
+    test_to_sequence: Dict[str, List[str]] = {}
+    for path in files:
+        for r in grouped_results[path]:
+            test_to_sequence.setdefault(r.test_id, []).append(r.status)
+
+    metrics: Dict[str, FlakeMetrics] = {}
+    for test_id, seq in test_to_sequence.items():
+        flips = pass_count = failerr_count = 0
+        longest_pass = longest_failerr = 0
+        current_pass = current_failerr = 0
+        last_decisive = None  # bucket of the latest pass or fail/error result
+        for status in seq:
+            bucket = status_key(status)
+            if bucket == 2:
+                # Neutral result: breaks both streaks, keeps last_decisive
+                current_pass = current_failerr = 0
+                continue
+            if last_decisive is not None and bucket != last_decisive:
+                flips += 1
+            last_decisive = bucket
+            if bucket == 0:  # pass
+                pass_count += 1
+                current_pass += 1
+                current_failerr = 0
+                longest_pass = max(longest_pass, current_pass)
+            else:  # fail/error
+                failerr_count += 1
+                current_failerr += 1
+                current_pass = 0
+                longest_failerr = max(longest_failerr, current_failerr)
+
+        # Only pass and fail/error results can flip, so they bound the flip count
+        max_possible_flips = max(0, pass_count + failerr_count - 1)
+        instability = (flips / max_possible_flips) if max_possible_flips > 0 else 0.0
+        metrics[test_id] = FlakeMetrics(
+            test_id=test_id,
+            total_runs=len(seq),
+            pass_count=pass_count,
+            fail_error_count=failerr_count,
+            skipped_count=seq.count("skipped"),
+            flips=flips,
+            instability_index=instability,
+            longest_pass_streak=longest_pass,
+            longest_failerr_streak=longest_failerr,
+        )
+    return metrics
diff --git a/flakewall/runner.py b/flakewall/runner.py
@@ -169,5 +169,122 @@ def retry_tests(
             working_dir=working_dir,
             dry_run=dry_run,
         )
+    elif framework == "vitest":
+        # Similar to jest: vitest -t <name> [file]
+        def _vitest_args_from_test_id(tid: str) -> List[str]:
+            """Map a ``file::name`` test id to ``[file] -t name`` vitest arguments."""
+            # Split on the first "::" only, so the name itself may contain "::".
+            head, sep, title = tid.partition("::")
+            if not sep:
+                # No "::" separator: the whole id is the name filter.
+                return ["-t", tid]
+            if "/" in head:
+                # Only a prefix containing "/" is treated as a file path.
+                return [head, "-t", title]
+            # Otherwise the prefix is dropped and only the name is kept.
+            return ["-t", title]
+
+        outcomes: List[RetryOutcome] = []
+        for tid in test_ids:
+            cmd_list = shlex.split(base_cmd or "vitest run") + _vitest_args_from_test_id(tid)
+            attempts = 0
+            passed_once = False
+            first_pass = None
+            for _ in range(0, max_retries + 1):
+                attempts += 1
+                if dry_run:
+                    print("DRY RUN: would run:", " ".join(shlex.quote(p) for p in cmd_list))
+                    continue
+                try:
+                    res = subprocess.run(
+                        cmd_list, cwd=working_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+                    )
+                    success = res.returncode == 0
+                except FileNotFoundError:
+                    raise SystemExit("vitest not found; provide --cmd (e.g., 'npx vitest run')")
+                if first_pass is None:
+                    first_pass = success
+                if success:
+                    passed_once = True
+                    break
+            outcomes.append(
+                RetryOutcome(
+                    test_id=tid,
+                    attempts=attempts,
+                    first_attempt_passed=bool(first_pass),
+                    eventually_passed=passed_once,
+                )
+            )
+        return outcomes
+    elif framework == "go":
+        # Run 'go test -run <name>' where name is a regex fragment
+        outcomes: List[RetryOutcome] = []
+        for tid in test_ids:
+            name = tid.split("::", 1)[-1]
+            cmd = base_cmd or f"go test -run {shlex.quote(name)} ./..."
+            attempts = 0
+            passed_once = False
+            first_pass = None
+            for _ in range(0, max_retries + 1):
+                attempts += 1
+                if dry_run:
+                    print(f"DRY RUN: would run: {cmd}")
+                    continue
+                res = subprocess.run(shlex.split(cmd), cwd=working_dir)
+                success = res.returncode == 0
+                if first_pass is None:
+                    first_pass = success
+                if success:
+                    passed_once = True
+                    break
+            outcomes.append(RetryOutcome(tid, attempts, bool(first_pass), passed_once))
+        return outcomes
+    elif framework == ".net" or framework == "dotnet":
+        # dotnet test --filter FullyQualifiedName~<name>
+        outcomes: List[RetryOutcome] = []
+        for tid in test_ids:
+            name = tid.split("::", 1)[-1]
+            cmd = base_cmd or f"dotnet test --filter FullyQualifiedName~{shlex.quote(name)}"
+            attempts = 0
+            passed_once = False
+            first_pass = None
+            for _ in range(0, max_retries + 1):
+                attempts += 1
+                if dry_run:
+                    print(f"DRY RUN: would run: {cmd}")
+                    continue
+                res = subprocess.run(shlex.split(cmd), cwd=working_dir)
+                success = res.returncode == 0
+                if first_pass is None:
+                    first_pass = success
+                if success:
+                    passed_once = True
+                    break
+            outcomes.append(RetryOutcome(tid, attempts, bool(first_pass), passed_once))
+        return outcomes
+    elif framework == "shell":
+        # Generic shell command template: base_cmd should contain {test}
+        if not base_cmd or "{test}" not in base_cmd:
+            raise SystemExit("for framework=shell, provide --cmd with a {test} placeholder")
+        outcomes: List[RetryOutcome] = []
+        for tid in test_ids:
+            cmd = base_cmd.replace("{test}", tid)
+            attempts = 0
+            passed_once = False
+            first_pass = None
+            for _ in range(0, max_retries + 1):
+                attempts += 1
+                if dry_run:
+                    print(f"DRY RUN: would run: {cmd}")
+                    continue
+                res = subprocess.run(cmd, cwd=working_dir, shell=True)
+                success = res.returncode == 0
+                if first_pass is None:
+                    first_pass = success
+                if success:
+                    passed_once = True
+                    break
+            outcomes.append(RetryOutcome(tid, attempts, bool(first_pass), passed_once))
+        return outcomes
     else:
         raise SystemExit(f"Unsupported framework: {framework}")