|
| 1 | +#!/usr/bin/env python |
| 2 | +"""Compare benchmark results across PR, main, and tag and output a markdown table.""" |
| 3 | + |
| 4 | +import json |
| 5 | +import statistics |
| 6 | +from pathlib import Path |
| 7 | +from typing import Literal, NamedTuple |
| 8 | + |
| 9 | + |
class BenchmarkResult(NamedTuple):
    """Summary statistics for a single benchmark entry.

    Only ``fullname`` and ``kind`` are required; every other field has a
    default.
    """

    # Benchmark identifier; parse_file uses it as the key of its result dict.
    fullname: str
    # Benchmark category; parse_file picks "query" when the fullname contains
    # the substring "query" and "index" otherwise.
    kind: Literal["index", "query"]
    # Only populated by parse_file for "index" results; stays None for "query".
    locality: Literal["local", "remote"] | None = None
    # NOTE(review): parse_file stores the string "Unknown" here when the
    # benchmark's extra_info has no "workers" entry, which this annotation
    # does not cover.
    workers: int | None = None
    # Timing statistics. _fmt multiplies them by 1000 and labels the result
    # "ms", so these are presumably seconds -- confirm against the producer.
    median: float = 0.0
    mean: float = 0.0
    stddev: float = 0.0
| 18 | + |
| 19 | + |
def parse_file(path: Path) -> dict[str, BenchmarkResult]:
    """Load a benchmark JSON report and summarise every benchmark in it.

    Args:
        path: JSON file with a top-level ``"benchmarks"`` list whose entries
            provide ``"fullname"`` and ``"stats" -> "data"`` (raw timings).

    Returns:
        Mapping of benchmark ``fullname`` to its ``BenchmarkResult``.

    Raises:
        ValueError: If a benchmark has no timing samples at all.
        KeyError: If a required key is missing from the report.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    results: dict[str, BenchmarkResult] = {}
    for benchmark in data["benchmarks"]:
        fullname: str = benchmark["fullname"]
        samples = benchmark["stats"]["data"]
        if not samples:
            raise ValueError(f"No timing samples recorded for {fullname!r}")
        # The first sample is discarded (presumably a warm-up round), but only
        # when that still leaves something to summarise.
        timings = samples[1:] or samples
        median = statistics.median(timings)
        mean = statistics.mean(timings)
        # statistics.stdev needs at least two data points; report no spread
        # for a single sample instead of raising.
        stddev = statistics.stdev(timings) if len(timings) > 1 else 0.0

        if "query" in fullname:
            result = BenchmarkResult(
                fullname=fullname, kind="query", median=median, mean=mean, stddev=stddev
            )
        else:
            locality: Literal["local", "remote"] = (
                "remote" if "openneuro" in fullname or "s3" in fullname else "local"
            )
            # extra_info may be absent or null in hand-written reports.
            extra_info = benchmark.get("extra_info") or {}
            workers = extra_info.get("workers", "Unknown")
            result = BenchmarkResult(
                fullname=fullname,
                kind="index",
                locality=locality,
                workers=workers,
                median=median,
                mean=mean,
                stddev=stddev,
            )
        results[fullname] = result
    return results
| 50 | + |
| 51 | + |
| 52 | +def _scale(val: float) -> float: |
| 53 | + return val * 1000 |
| 54 | + |
| 55 | + |
def _fmt(res: BenchmarkResult) -> str:
    """Render a result as ``median (mean ± stddev) ms`` with three decimals.

    Each statistic is passed through ``_scale`` before formatting.
    """
    med, avg, spread = (_scale(stat) for stat in (res.median, res.mean, res.stddev))
    return f"{med:.3f} ({avg:.3f} ± {spread:.3f}) ms"
| 61 | + |
| 62 | + |
| 63 | +def _delta(pr: BenchmarkResult, ref: BenchmarkResult) -> str: |
| 64 | + if ref == 0: |
| 65 | + return "N/A" |
| 66 | + diff = _scale(pr.median - ref.median) |
| 67 | + pct = (pr.median / ref.median - 1) * 100 |
| 68 | + icon = "🔴" if pct > 5 else "🟢" if pct < -5 else "⚪" |
| 69 | + return f"{icon} {diff:+.3f} ms ({pct:+.1f}%)" |
| 70 | + |
| 71 | + |
| 72 | +def _label(result: BenchmarkResult) -> str: |
| 73 | + if result.kind == "query": |
| 74 | + return ( |
| 75 | + result.fullname.split("::")[-1] |
| 76 | + .replace("test_", "") |
| 77 | + .replace("_", " ") |
| 78 | + .capitalize() |
| 79 | + ) |
| 80 | + return f"{result.locality.capitalize()} index ({result.workers} workers)" |
| 81 | + |
| 82 | + |
def build_table(
    pr: dict[str, BenchmarkResult],
    main: dict[str, BenchmarkResult],
    tag: dict[str, BenchmarkResult],
    tag_name: str,
) -> str:
    """Render PR, main and tag benchmark results as a markdown table.

    One column is emitted per benchmark found in any of the three inputs;
    benchmarks missing from an input are shown as "—".

    Args:
        pr: Results measured on the PR, keyed by benchmark fullname.
        main: Results measured on main, keyed by benchmark fullname.
        tag: Results measured on the tag, keyed by benchmark fullname.
        tag_name: Name shown for the tag row and the "PR vs ..." row.

    Returns:
        The markdown document as a single newline-joined string.
    """
    # Sorted so the column order is reproducible: iterating a bare set of
    # strings can change from one interpreter run to the next.
    all_keys = sorted(set(pr) | set(main) | set(tag))
    labels = [_label(pr.get(k) or main.get(k) or tag.get(k)) for k in all_keys]

    col_sep = " | "
    header = "| |" + col_sep.join(f" **{label}** " for label in labels) + " |"
    divider = "|-|" + "|".join("---" for _ in all_keys) + "|"

    def row(name: str, results: dict[str, BenchmarkResult]) -> str:
        """Format one row of absolute timings for *results*."""
        cells = [_fmt(results[k]) if k in results else "—" for k in all_keys]
        return "| **" + name + "** |" + col_sep.join(f" {c} " for c in cells) + " |"

    def delta_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
        """Format one row comparing the PR against *ref*."""
        cells = [
            _delta(pr[k], ref[k]) if k in pr and k in ref else "—" for k in all_keys
        ]
        return "| *" + label + "* |" + col_sep.join(f" {c} " for c in cells) + " |"

    lines = [
        "## Benchmark Results",
        "",
        header,
        divider,
        row("PR", pr),
        row("main", main),
        row(tag_name, tag),
        # A row of empty cells separates absolute timings from the deltas.
        divider.replace("-", ""),
        delta_row("PR vs main", main),
        delta_row(f"PR vs {tag_name}", tag),
        "",
        "> `median (mean ± std)`",
        "> ",
        "🔴 >5% slower ⚪ within 5% 🟢 >5% faster",
    ]
    return "\n".join(lines)
| 123 | + |
| 124 | + |
def main():
    """Command-line entry point: compare three benchmark reports.

    Globs for benchmark JSON files, infers which one belongs to the PR, to
    main and to the tag, prints the markdown comparison table and, when
    ``--output`` is given, also writes it to that file.

    Raises:
        ValueError: If the glob does not match exactly three files, or the
            pr/main/tag reports cannot all be identified.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pattern",
        default="benchmark-*.json",
        help="Glob pattern for benchmark JSON files",
    )
    parser.add_argument(
        "-o",
        "--output",
        # Convert to Path so write_text is available; argparse yields str otherwise.
        type=Path,
        help="Output markdown filepath containing benchmark comparisons",
    )
    args = parser.parse_args()

    files = sorted(Path(".").glob(args.pattern))
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if len(files) != 3:
        raise ValueError(f"Expected 3 files, found {len(files)}: {files}")

    # Infer pr/main/tag from directory name
    parsed: dict[str, dict[str, BenchmarkResult]] = {}
    tag = None
    for f in files:
        # A file matched directly in the current directory has an empty
        # parent name (the default pattern does this); use the file stem then.
        stem = f.parent.name or f.stem  # e.g. "benchmark-pr"
        key = stem.split("-")[-1]  # "pr", "main", tag
        if key not in ("pr", "main"):
            tag = key
        parsed[key] = parse_file(f)
    if tag is None:
        raise ValueError("Unknown tag")
    missing = [name for name in ("pr", "main") if name not in parsed]
    if missing:
        raise ValueError(f"Could not identify benchmark files for: {missing}")
    table = build_table(parsed["pr"], parsed["main"], parsed[tag], tag_name=tag)
    # --output is optional; only write a file when it was requested.
    if args.output is not None:
        # The table contains non-ASCII symbols, so pin the encoding.
        args.output.write_text(table, encoding="utf-8")
    print(table)
| 158 | + |
| 159 | + |
# Run the comparison only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
0 commit comments