Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions scripts/compare-bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,28 @@ def pct_change(base_ns: int, head_ns: int) -> float:
return ((head_ns - base_ns) / base_ns) * 100.0


def render_markdown(rows: list[tuple[str, int, int, float]], threshold_pct: float) -> str:
def status_for(delta_pct: float, abs_delta_ns: int, threshold_pct: float, min_abs_ns: int) -> str:
    """Classify one benchmark delta.

    Returns:
        "OK"    if the percentage change is within ``threshold_pct``;
        "NOISE" if the percentage threshold was exceeded but the absolute
                delta is at most ``min_abs_ns`` (too small to fail CI —
                prevents false positives on fast tools where noise dominates);
        "FAIL"  if both the percentage and absolute thresholds are exceeded.
    """
    if delta_pct <= threshold_pct:
        return "OK"
    if abs_delta_ns <= min_abs_ns:
        return "NOISE"
    return "FAIL"


def render_markdown(rows: list[tuple[str, int, int, float, int]], threshold_pct: float, min_abs_ns: int) -> str:
    """Render the benchmark comparison as a Markdown report.

    Args:
        rows: ``(tool, base_ns, head_ns, delta_pct, abs_delta_ns)`` tuples.
        threshold_pct: percentage-change threshold for flagging a regression.
        min_abs_ns: minimum absolute delta (ns) required to FAIL rather
            than be classified as NOISE.

    Returns:
        The full report as a Markdown string, terminated by a newline.
    """
    lines = [
        "## Benchmark Regression Report",
        "",
        f"Thresholds: {threshold_pct:.2f}% and {min_abs_ns:,} ns absolute delta",
        "",
        "`NOISE` means the percentage threshold was exceeded, but the absolute delta was too small to fail CI.",
        "",
        "| Tool | Base (ns) | Head (ns) | Delta | Abs Delta (ns) | Status |",
        "| --- | ---: | ---: | ---: | ---: | --- |",
    ]
    for tool, base_ns, head_ns, delta, abs_delta in rows:
        status = status_for(delta, abs_delta, threshold_pct, min_abs_ns)
        lines.append(f"| `{tool}` | {base_ns} | {head_ns} | {delta:+.2f}% | {abs_delta:+d} | {status} |")
    return "\n".join(lines) + "\n"


Expand All @@ -57,21 +67,21 @@ def main() -> int:
return 1
common = sorted(set(base) & set(head))

rows: list[tuple[str, int, int, float]] = []
rows: list[tuple[str, int, int, float, int]] = []
failures: list[str] = []

for tool in common:
base_ns = int(base[tool]["avg_latency_ns"])
head_ns = int(head[tool]["avg_latency_ns"])
delta = pct_change(base_ns, head_ns)
abs_delta = head_ns - base_ns
rows.append((tool, base_ns, head_ns, delta))
rows.append((tool, base_ns, head_ns, delta, abs_delta))
# Only flag as regression if BOTH percentage AND absolute delta exceed thresholds
# This prevents false positives on fast tools where CI noise dominates
if delta > args.threshold_pct and abs_delta > args.min_abs_ns:
failures.append(f"{tool} regressed by {delta:.2f}%")
failures.append(f"{tool} regressed by {delta:.2f}% ({abs_delta:+d} ns)")

report = render_markdown(rows, args.threshold_pct)
report = render_markdown(rows, args.threshold_pct, args.min_abs_ns)
sys.stdout.write(report)

if args.markdown_out:
Expand Down
35 changes: 35 additions & 0 deletions scripts/test_compare_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
from __future__ import annotations

import importlib.util
from pathlib import Path
import unittest


# Load the script under test as a module. A normal `import` is impossible
# because the filename contains a hyphen ("compare-bench.py"), so we build
# an import spec from the file path and execute it manually.
SCRIPT = Path(__file__).with_name("compare-bench.py")
spec = importlib.util.spec_from_file_location("compare_bench", SCRIPT)
# Narrow Optionals for the type checker; these only fail if the file is missing.
assert spec is not None
compare_bench = importlib.util.module_from_spec(spec)
assert spec.loader is not None
# Executes the script's top level, binding its functions onto `compare_bench`.
spec.loader.exec_module(compare_bench)


class CompareBenchTests(unittest.TestCase):
    """Tests for the NOISE/FAIL classification logic in compare-bench.py."""

    def test_small_absolute_regression_is_noise(self) -> None:
        # Over the % threshold, but under the absolute floor -> NOISE.
        status = compare_bench.status_for(22.54, 11_399, 10.0, 50_000)
        self.assertEqual(status, "NOISE")

    def test_large_absolute_regression_fails(self) -> None:
        # Over both thresholds -> genuine regression.
        status = compare_bench.status_for(12.0, 75_000, 10.0, 50_000)
        self.assertEqual(status, "FAIL")

    def test_report_explains_noise_status(self) -> None:
        rows = [("codedb_read", 50_580, 61_979, 22.54, 11_399)]
        report = compare_bench.render_markdown(rows, 10.0, 50_000)
        self.assertIn("50,000 ns absolute delta", report)
        self.assertIn(
            "| `codedb_read` | 50580 | 61979 | +22.54% | +11399 | NOISE |",
            report,
        )


# Allow running the tests directly: `python scripts/test_compare_bench.py`.
if __name__ == "__main__":
    unittest.main()
Loading