Skip to content

Commit 833c21c

Browse files
committed
feat: richer flake scoring (flips/instability/streaks); add vitest/go/dotnet/shell runners
1 parent 1e89279 commit 833c21c

3 files changed

Lines changed: 311 additions & 33 deletions

File tree

flakewall/cli.py

Lines changed: 92 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,13 @@
99
import typer
1010

1111
from .config import FlakewallConfig, ensure_default_files, CONFIG_PATH
12-
from .junit import parse_junit_files, failing_ids, compute_flake_stats
12+
from .junit import (
13+
parse_junit_files,
14+
parse_junit_files_grouped,
15+
failing_ids,
16+
compute_flake_stats,
17+
compute_flake_metrics,
18+
)
1319
from .quarantine import load_quarantined, add_to_quarantine
1420
from .runner import retry_tests
1521
from . import __version__
@@ -116,51 +122,105 @@ def score(
116122
gh_summary: bool = typer.Option(
117123
False, help="Write summary of flaky candidates to GITHUB_STEP_SUMMARY"
118124
),
125+
rich: bool = typer.Option(
126+
False, help="Compute richer metrics (flips, instability, streaks) across runs"
127+
),
119128
) -> None:
120129
"""Compute minimal flake stats from a set of JUnit XML files and print tests that flipped."""
121130
cfg = FlakewallConfig.load()
122131
pattern = junit or cfg.report_glob
123132
files = [Path(p) for p in glob.glob(pattern, recursive=True)]
124133
results = parse_junit_files(files)
125-
stats = compute_flake_stats(results)
126-
127-
flippers = [s for s in stats.values() if s.total_runs >= min_total and s.has_flip]
128-
flippers.sort(key=lambda s: (-(s.fail_ratio), s.test_id))
134+
if rich:
135+
grouped = parse_junit_files_grouped(files)
136+
metrics = compute_flake_metrics(grouped)
137+
flippers = [
138+
m
139+
for m in metrics.values()
140+
if m.total_runs >= min_total and (m.fail_error_count > 0 and m.pass_count > 0)
141+
]
142+
flippers.sort(key=lambda m: (-m.instability_index, -m.flips, m.test_id))
143+
else:
144+
stats = compute_flake_stats(results)
145+
flippers = [s for s in stats.values() if s.total_runs >= min_total and s.has_flip]
146+
flippers.sort(key=lambda s: (-(s.fail_ratio), s.test_id))
129147

130148
if json_out:
131-
payload = {
132-
"files_count": len(files),
133-
"cases_count": len(results),
134-
"flaky_candidates_count": len(flippers),
135-
"flaky_candidates": [
136-
{
137-
"test_id": s.test_id,
138-
"total_runs": s.total_runs,
139-
"pass_count": s.pass_count,
140-
"fail_error_count": s.fail_error_count,
141-
"skipped_count": s.skipped_count,
142-
"has_flip": s.has_flip,
143-
"fail_ratio": s.fail_ratio,
144-
}
145-
for s in flippers
146-
],
147-
}
149+
if rich:
150+
payload = {
151+
"files_count": len(files),
152+
"flaky_candidates_count": len(flippers),
153+
"flaky_candidates": [
154+
{
155+
"test_id": m.test_id,
156+
"total_runs": m.total_runs,
157+
"pass_count": m.pass_count,
158+
"fail_error_count": m.fail_error_count,
159+
"skipped_count": m.skipped_count,
160+
"flips": m.flips,
161+
"instability_index": m.instability_index,
162+
"longest_pass_streak": m.longest_pass_streak,
163+
"longest_failerr_streak": m.longest_failerr_streak,
164+
}
165+
for m in flippers
166+
],
167+
}
168+
else:
169+
payload = {
170+
"files_count": len(files),
171+
"cases_count": len(results),
172+
"flaky_candidates_count": len(flippers),
173+
"flaky_candidates": [
174+
{
175+
"test_id": s.test_id,
176+
"total_runs": s.total_runs,
177+
"pass_count": s.pass_count,
178+
"fail_error_count": s.fail_error_count,
179+
"skipped_count": s.skipped_count,
180+
"has_flip": s.has_flip,
181+
"fail_ratio": s.fail_ratio,
182+
}
183+
for s in flippers
184+
],
185+
}
148186
typer.echo(json.dumps(payload, indent=2))
149187
return
150188
else:
151-
header = f"Files: {len(files)} | Cases: {len(results)} | Flaky candidates: {len(flippers)}"
152-
typer.echo(header)
153-
for s in flippers:
154-
line = (
155-
f" - {s.test_id}: runs={s.total_runs} pass={s.pass_count} "
156-
f"fail+error={s.fail_error_count} skipped={s.skipped_count} "
157-
f"fail_ratio={s.fail_ratio:.2f}"
189+
if rich:
190+
header = f"Files: {len(files)} | Flaky candidates: {len(flippers)}"
191+
else:
192+
header = (
193+
f"Files: {len(files)} | Cases: {len(results)} | Flaky candidates: {len(flippers)}"
158194
)
159-
typer.echo(line)
195+
typer.echo(header)
196+
if rich:
197+
for m in flippers:
198+
line = (
199+
f" - {m.test_id}: runs={m.total_runs} flips={m.flips} "
200+
f"instability={m.instability_index:.2f} pass_streak={m.longest_pass_streak} "
201+
f"fail_streak={m.longest_failerr_streak}"
202+
)
203+
typer.echo(line)
204+
else:
205+
for s in flippers:
206+
line = (
207+
f" - {s.test_id}: runs={s.total_runs} pass={s.pass_count} "
208+
f"fail+error={s.fail_error_count} skipped={s.skipped_count} "
209+
f"fail_ratio={s.fail_ratio:.2f}"
210+
)
211+
typer.echo(line)
160212
if gh_summary:
161213
lines = ["### flakewall score", header]
162-
for s in flippers:
163-
lines.append(f"- {s.test_id} (runs={s.total_runs}, fail_ratio={s.fail_ratio:.2f})")
214+
if rich:
215+
for m in flippers:
216+
lines.append(
217+
f"- {m.test_id} (runs={m.total_runs}, flips={m.flips}, instability={m.instability_index:.2f})"
218+
)
219+
else:
220+
for s in flippers:
221+
lines.append(
222+
f"- {s.test_id} (runs={s.total_runs}, fail_ratio={s.fail_ratio:.2f})"
223+
)
164224
write_step_summary(lines)
165225

166226

flakewall/junit.py

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from dataclasses import dataclass
44
from pathlib import Path
5-
from typing import Iterable, List, Dict
5+
from typing import Iterable, List, Dict, Tuple
66
import xml.etree.ElementTree as ET
77

88

@@ -38,6 +38,17 @@ def parse_junit_files(paths: Iterable[Path]) -> List[CaseResult]:
3838
return results
3939

4040

41+
def parse_junit_file(path: Path) -> List[CaseResult]:
    """Parse one JUnit XML report and return its case results.

    Convenience wrapper that hands a one-element list to
    ``parse_junit_files``, so single-file and multi-file parsing share the
    same code path.
    """
    single_report = [path]
    return parse_junit_files(single_report)
43+
44+
45+
def parse_junit_files_grouped(paths: Iterable[Path]) -> Dict[Path, List[CaseResult]]:
    """Parse each report separately and key the results by report path.

    Unlike ``parse_junit_files``, which flattens everything into one list,
    this keeps the per-file grouping so callers can reason about runs in
    order. Keys appear in the order the paths were supplied; a path given
    more than once is parsed again and its entry overwritten.
    """
    return {report: parse_junit_file(report) for report in paths}
50+
51+
4152
def failing_ids(results: Iterable[CaseResult]) -> List[str]:
    """Return the ids of all results whose status is ``fail`` or ``error``.

    Input order is preserved and duplicates are kept: a test that failed in
    several results appears once per failing result.
    """
    failing_statuses = {"fail", "error"}
    collected: List[str] = []
    for result in results:
        if result.status in failing_statuses:
            collected.append(result.test_id)
    return collected
4354

@@ -90,3 +101,93 @@ def compute_flake_stats(results: Iterable[CaseResult]) -> Dict[str, FlakeStats]:
90101
skipped_count=c["skipped"],
91102
)
92103
return stats
104+
105+
106+
@dataclass(frozen=True)
class FlakeMetrics:
    """Flakiness metrics for one test, derived from its ordered run history."""

    test_id: str
    total_runs: int  # every recorded result, including skipped/unknown ones
    pass_count: int
    fail_error_count: int  # "fail" and "error" results combined
    skipped_count: int
    flips: int  # pass <-> fail/error changes between consecutive decisive runs
    instability_index: float  # flips / (decisive runs - 1); 0.0 below two decisive runs
    longest_pass_streak: int
    longest_failerr_streak: int


def compute_flake_metrics(
    grouped_results: Dict[Path, List[CaseResult]],
    order: str = "name",  # name|mtime
) -> Dict[str, FlakeMetrics]:
    """Compute flip, instability and streak metrics for every test across runs.

    Each key of ``grouped_results`` is treated as one run. Runs are ordered
    by ``str(path)`` by default, or by file modification time when ``order``
    is ``"mtime"`` (any other value falls back to name ordering). The
    statuses of each test are then read in that order.

    A "decisive" run is one whose status is ``pass`` or ``fail``/``error``
    (the latter two are treated as the same outcome). Any other status,
    including ``skipped``, is non-decisive: it resets the running streaks
    but is otherwise transparent, so ``pass, skipped, fail`` counts as one
    flip. ``instability_index`` is normalized by the number of decisive
    runs minus one, so a test that alternates on every decisive run scores
    1.0 even when some runs were skipped.

    Args:
        grouped_results: Mapping of report path to the case results parsed
            from that report.
        order: ``"name"`` (default) or ``"mtime"``.

    Returns:
        Mapping of test id to its ``FlakeMetrics``.

    Raises:
        OSError: If ``order`` is ``"mtime"`` and a report path cannot be
            stat'ed.
    """

    def status_key(status: str) -> int:
        # 0 = pass, 1 = fail/error, 2 = non-decisive (skipped or unknown).
        if status == "pass":
            return 0
        if status in {"fail", "error"}:
            return 1
        return 2

    # Order files deterministically so the per-test sequences are stable.
    if order == "mtime":
        files = sorted(grouped_results, key=lambda p: p.stat().st_mtime)
    else:
        files = sorted(grouped_results, key=str)

    # Build per-test status sequences across files.
    test_to_sequence: Dict[str, List[str]] = {}
    for path in files:
        for r in grouped_results[path]:
            test_to_sequence.setdefault(r.test_id, []).append(r.status)

    metrics: Dict[str, FlakeMetrics] = {}
    for test_id, seq in test_to_sequence.items():
        flips = 0
        longest_pass = 0
        longest_failerr = 0
        current_pass = 0
        current_failerr = 0
        last_decisive = None  # bucket of the most recent pass/fail outcome
        for status in seq:
            bucket = status_key(status)
            if bucket == 2:
                # A skipped/unknown run interrupts any streak, but it is not
                # an outcome: keep last_decisive so a change of outcome
                # across the gap is still counted as a flip.
                current_pass = 0
                current_failerr = 0
                continue
            if bucket == 0:
                current_pass += 1
                current_failerr = 0
                longest_pass = max(longest_pass, current_pass)
            else:
                current_failerr += 1
                current_pass = 0
                longest_failerr = max(longest_failerr, current_failerr)
            if last_decisive is not None and bucket != last_decisive:
                flips += 1
            last_decisive = bucket

        pass_count = seq.count("pass")
        fail_error_count = sum(1 for s in seq if s in {"fail", "error"})
        # Only decisive runs can take part in a flip, so they alone bound
        # the maximum number of flips.
        max_possible_flips = pass_count + fail_error_count - 1
        instability = (flips / max_possible_flips) if max_possible_flips > 0 else 0.0
        metrics[test_id] = FlakeMetrics(
            test_id=test_id,
            total_runs=len(seq),
            pass_count=pass_count,
            fail_error_count=fail_error_count,
            skipped_count=seq.count("skipped"),
            flips=flips,
            instability_index=instability,
            longest_pass_streak=longest_pass,
            longest_failerr_streak=longest_failerr,
        )
    return metrics

flakewall/runner.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,5 +169,122 @@ def retry_tests(
169169
working_dir=working_dir,
170170
dry_run=dry_run,
171171
)
172+
elif framework == "vitest":
173+
# Similar to jest: vitest -t <name> [file]
174+
def _vitest_args_from_test_id(tid: str) -> List[str]:
175+
file_arg = None
176+
name = tid
177+
if "::" in tid:
178+
head, name = tid.split("::", 1)
179+
if "/" in head:
180+
file_arg = head
181+
args = []
182+
if file_arg:
183+
args.append(file_arg)
184+
args.extend(["-t", name])
185+
return args
186+
187+
outcomes: List[RetryOutcome] = []
188+
for tid in test_ids:
189+
cmd_list = shlex.split(base_cmd or "vitest run") + _vitest_args_from_test_id(tid)
190+
attempts = 0
191+
passed_once = False
192+
first_pass = None
193+
for _ in range(0, max_retries + 1):
194+
attempts += 1
195+
if dry_run:
196+
print("DRY RUN: would run:", " ".join(shlex.quote(p) for p in cmd_list))
197+
continue
198+
try:
199+
res = subprocess.run(
200+
cmd_list, cwd=working_dir, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
201+
)
202+
success = res.returncode == 0
203+
except FileNotFoundError:
204+
raise SystemExit("vitest not found; provide --cmd (e.g., 'npx vitest run')")
205+
if first_pass is None:
206+
first_pass = success
207+
if success:
208+
passed_once = True
209+
break
210+
outcomes.append(
211+
RetryOutcome(
212+
test_id=tid,
213+
attempts=attempts,
214+
first_attempt_passed=bool(first_pass),
215+
eventually_passed=passed_once,
216+
)
217+
)
218+
return outcomes
219+
elif framework == "go":
220+
# Run 'go test -run <name>' where name is a regex fragment
221+
outcomes: List[RetryOutcome] = []
222+
for tid in test_ids:
223+
name = tid.split("::", 1)[-1]
224+
cmd = base_cmd or f"go test -run {shlex.quote(name)} ./..."
225+
attempts = 0
226+
passed_once = False
227+
first_pass = None
228+
for _ in range(0, max_retries + 1):
229+
attempts += 1
230+
if dry_run:
231+
print(f"DRY RUN: would run: {cmd}")
232+
continue
233+
res = subprocess.run(shlex.split(cmd), cwd=working_dir)
234+
success = res.returncode == 0
235+
if first_pass is None:
236+
first_pass = success
237+
if success:
238+
passed_once = True
239+
break
240+
outcomes.append(RetryOutcome(tid, attempts, bool(first_pass), passed_once))
241+
return outcomes
242+
elif framework == ".net" or framework == "dotnet":
243+
# dotnet test --filter FullyQualifiedName~<name>
244+
outcomes: List[RetryOutcome] = []
245+
for tid in test_ids:
246+
name = tid.split("::", 1)[-1]
247+
cmd = base_cmd or f"dotnet test --filter FullyQualifiedName~{shlex.quote(name)}"
248+
attempts = 0
249+
passed_once = False
250+
first_pass = None
251+
for _ in range(0, max_retries + 1):
252+
attempts += 1
253+
if dry_run:
254+
print(f"DRY RUN: would run: {cmd}")
255+
continue
256+
res = subprocess.run(shlex.split(cmd), cwd=working_dir)
257+
success = res.returncode == 0
258+
if first_pass is None:
259+
first_pass = success
260+
if success:
261+
passed_once = True
262+
break
263+
outcomes.append(RetryOutcome(tid, attempts, bool(first_pass), passed_once))
264+
return outcomes
265+
elif framework == "shell":
266+
# Generic shell command template: base_cmd should contain {test}
267+
if not base_cmd or "{test}" not in base_cmd:
268+
raise SystemExit("for framework=shell, provide --cmd with a {test} placeholder")
269+
outcomes: List[RetryOutcome] = []
270+
for tid in test_ids:
271+
cmd = base_cmd.replace("{test}", tid)
272+
attempts = 0
273+
passed_once = False
274+
first_pass = None
275+
for _ in range(0, max_retries + 1):
276+
attempts += 1
277+
if dry_run:
278+
print(f"DRY RUN: would run: {cmd}")
279+
continue
280+
res = subprocess.run(cmd, cwd=working_dir, shell=True)
281+
success = res.returncode == 0
282+
if first_pass is None:
283+
first_pass = success
284+
if success:
285+
passed_once = True
286+
break
287+
outcomes.append(RetryOutcome(tid, attempts, bool(first_pass), passed_once))
288+
return outcomes
172289
else:
173290
raise SystemExit(f"Unsupported framework: {framework}")

0 commit comments

Comments
 (0)