Skip to content

Commit d0b8881

Browse files
committed
Use a per-benchmarks baseline instead of the last fully succesful run
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent 47d2041 commit d0b8881

4 files changed

Lines changed: 122 additions & 18 deletions

File tree

.github/workflows/bench-pr.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,8 @@ jobs:
9898
run: |
9999
set -Eeu -o pipefail -x
100100
101-
base_commit_sha=$(\
102-
curl -L \
103-
-H "Accept: application/vnd.github+json" \
104-
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
105-
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
106-
| jq -r '.workflow_runs[].head_sha' \
107-
)
108-
109101
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
110-
gzip -d -c data.json.gz | grep $base_commit_sha > base.json
102+
gzip -d -c data.json.gz > base.json
111103
112104
echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
113105
echo '' >> comment.md

.github/workflows/sql-benchmarks.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -443,16 +443,8 @@ jobs:
443443
run: |
444444
set -Eeu -o pipefail -x
445445
446-
base_commit_sha=$(\
447-
curl -L \
448-
-H "Accept: application/vnd.github+json" \
449-
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
450-
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
451-
| jq -r '.workflow_runs[].head_sha' \
452-
)
453-
454446
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
455-
gzip -d -c data.json.gz | grep $base_commit_sha > base.json
447+
gzip -d -c data.json.gz > base.json
456448
457449
echo '# Benchmarks: ${{ matrix.name }}' > comment.md
458450
echo '' >> comment.md

scripts/compare-benchmark-jsons.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,63 @@ def split_file_size_rows(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
7777
return df[mask].copy(), df[~mask].copy()
7878

7979

80+
def identity_value(value: Any) -> Any:
81+
"""Normalize missing values so benchmark identities compare reliably."""
82+
83+
return None if pd.isna(value) else value
84+
85+
86+
def benchmark_identity_rows(df: pd.DataFrame) -> pd.DataFrame:
87+
"""Return timing rows with the identity used to match a PR benchmark."""
88+
89+
_file_size_rows, timing_rows = split_file_size_rows(df)
90+
if timing_rows.empty or "name" not in timing_rows.columns:
91+
return pd.DataFrame(columns=["commit_id", "benchmark_identity"])
92+
93+
timing_rows = timing_rows.copy()
94+
if "storage" not in timing_rows.columns:
95+
timing_rows["storage"] = pd.NA
96+
if "commit_id" not in timing_rows.columns:
97+
timing_rows["commit_id"] = pd.NA
98+
99+
timing_rows = extract_dataset_key(timing_rows)
100+
timing_rows["benchmark_identity"] = [
101+
tuple(identity_value(row[column]) for column in ("name", "storage", "dataset_key"))
102+
for _, row in timing_rows.iterrows()
103+
]
104+
105+
return timing_rows[["commit_id", "benchmark_identity"]]
106+
107+
108+
def select_latest_baseline_rows(base: pd.DataFrame, pr: pd.DataFrame) -> pd.DataFrame:
109+
"""Select rows from the latest baseline commit containing this benchmark.
110+
111+
The persisted benchmark history is append-only. A row only appears after
112+
that benchmark job uploaded results, so the newest commit with matching row
113+
identities is the latest successful baseline for the benchmark under test.
114+
"""
115+
116+
if base.empty or "commit_id" not in base.columns:
117+
return base
118+
119+
commit_ids = base["commit_id"].dropna().unique()
120+
if len(commit_ids) <= 1:
121+
return base
122+
123+
pr_identities = set(benchmark_identity_rows(pr)["benchmark_identity"])
124+
if not pr_identities:
125+
return base
126+
127+
base_identities = benchmark_identity_rows(base)
128+
matches = base_identities[base_identities["benchmark_identity"].isin(pr_identities)]
129+
matches = matches[matches["commit_id"].notna()]
130+
if matches.empty:
131+
raise ValueError("No baseline rows found for the benchmark under test")
132+
133+
baseline_commit_id = matches["commit_id"].iloc[-1]
134+
return base[base["commit_id"] == baseline_commit_id].copy()
135+
136+
80137
def extract_target_fields(name: str) -> pd.Series:
81138
"""Parse query, engine, and format from the benchmark name."""
82139

@@ -704,6 +761,7 @@ def main() -> None:
704761

705762
base = pd.read_json(sys.argv[1], lines=True)
706763
pr = pd.read_json(sys.argv[2], lines=True)
764+
base = select_latest_baseline_rows(base, pr)
707765

708766
base_commit_id = set(base["commit_id"].unique())
709767
pr_commit_id = set(pr["commit_id"].unique())

scripts/tests/test_benchmark_reporting.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,68 @@ def timing_row(name: str, base: int, pr: int) -> dict[str, object]:
3333
}
3434

3535

36+
def stored_timing_row(
37+
commit: str,
38+
name: str,
39+
value: int,
40+
storage: str | None = None,
41+
dataset: dict[str, object] | None = None,
42+
) -> dict[str, object]:
43+
row: dict[str, object] = {
44+
"name": name,
45+
"unit": "ns",
46+
"value": value,
47+
"all_runtimes": [value, value, value],
48+
"commit_id": commit,
49+
}
50+
if storage is not None:
51+
row["storage"] = storage
52+
if dataset is not None:
53+
row["dataset"] = dataset
54+
return row
55+
56+
57+
def test_select_latest_baseline_rows_uses_latest_matching_benchmark_commit() -> None:
58+
compare = load_compare_module()
59+
history = pd.DataFrame(
60+
[
61+
stored_timing_row(
62+
"base-old",
63+
"tpch_q01/datafusion:parquet",
64+
100,
65+
"nvme",
66+
{"scale_factor": "1.0"},
67+
),
68+
file_size_record_for("base-old", 100, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
69+
stored_timing_row(
70+
"base-current",
71+
"tpch_q01/datafusion:parquet",
72+
110,
73+
"nvme",
74+
{"scale_factor": "1.0"},
75+
),
76+
file_size_record_for("base-current", 120, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
77+
stored_timing_row("base-other", "clickbench_q01/datafusion:parquet", 200, "nvme"),
78+
]
79+
)
80+
pr = pd.DataFrame(
81+
[
82+
stored_timing_row(
83+
"pr-sha",
84+
"tpch_q01/datafusion:parquet",
85+
115,
86+
"nvme",
87+
{"scale_factor": "1.0"},
88+
),
89+
]
90+
)
91+
92+
selected = compare.select_latest_baseline_rows(history, pr)
93+
94+
assert set(selected["commit_id"]) == {"base-current"}
95+
assert len(selected) == 2
96+
97+
3698
def test_within_engine_analysis_uses_each_engines_own_parquet_control() -> None:
3799
compare = load_compare_module()
38100
rows = [

0 commit comments

Comments
 (0)