Skip to content

Commit 0a41704

Browse files
authored
Use a per-benchmarks baseline instead of the last fully succesful run (#8332)
## Summary Currently - when one post-merge benchmarks fails for any reason, the baseline we use is the last fully successful run. This PR changes it to be the last successful run of that specific benchmark. Tested this works by running it in such a state and visually verifying the results. --------- Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent 630484f commit 0a41704

4 files changed

Lines changed: 232 additions & 22 deletions

File tree

.github/workflows/bench-pr.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,8 @@ jobs:
9898
run: |
9999
set -Eeu -o pipefail -x
100100
101-
base_commit_sha=$(\
102-
curl -L \
103-
-H "Accept: application/vnd.github+json" \
104-
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
105-
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
106-
| jq -r '.workflow_runs[].head_sha' \
107-
)
108-
109101
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
110-
gzip -d -c data.json.gz | grep $base_commit_sha > base.json
102+
gzip -d -c data.json.gz > base.json
111103
112104
echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
113105
echo '' >> comment.md

.github/workflows/sql-benchmarks.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -443,16 +443,8 @@ jobs:
443443
run: |
444444
set -Eeu -o pipefail -x
445445
446-
base_commit_sha=$(\
447-
curl -L \
448-
-H "Accept: application/vnd.github+json" \
449-
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
450-
https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
451-
| jq -r '.workflow_runs[].head_sha' \
452-
)
453-
454446
python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
455-
gzip -d -c data.json.gz | grep $base_commit_sha > base.json
447+
gzip -d -c data.json.gz > base.json
456448
457449
echo '# Benchmarks: ${{ matrix.name }}' > comment.md
458450
echo '' >> comment.md

scripts/compare-benchmark-jsons.py

Lines changed: 124 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# "numpy",
55
# "pandas",
66
# "tabulate",
7+
# "orjson"
78
# ]
89
# ///
910

@@ -18,6 +19,7 @@
1819
from typing import Any
1920

2021
import numpy as np
22+
import orjson
2123
import pandas as pd
2224

2325
# Analysis overview:
@@ -59,9 +61,7 @@ def extract_dataset_key(df: pd.DataFrame) -> pd.DataFrame:
5961
if "dataset" not in df.columns:
6062
df["dataset_key"] = pd.NA
6163
else:
62-
df["dataset_key"] = df["dataset"].apply(
63-
lambda x: str(sorted(x.items())) if pd.notna(x) and isinstance(x, dict) else pd.NA
64-
)
64+
df["dataset_key"] = df["dataset"].apply(dataset_key)
6565
return df
6666

6767

@@ -77,6 +77,126 @@ def split_file_size_rows(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
7777
return df[mask].copy(), df[~mask].copy()
7878

7979

80+
def identity_value(value: Any) -> Any:
81+
"""Normalize missing values so benchmark identities compare reliably."""
82+
83+
return None if pd.isna(value) else value
84+
85+
86+
def dataset_key(value: Any) -> str | None:
87+
"""Normalize dataset metadata into the join-key representation."""
88+
89+
if isinstance(value, dict):
90+
return str(sorted(value.items()))
91+
return None
92+
93+
94+
def benchmark_identity(row: Any) -> tuple[Any, Any, Any] | None:
95+
"""Return the timing-row identity used to find a matching baseline."""
96+
97+
if row.get("metric") == FILE_SIZE_METRIC or row.get("file_size") is not None:
98+
return None
99+
100+
name = row.get("name")
101+
if name is None:
102+
return None
103+
104+
return (
105+
identity_value(name),
106+
identity_value(row.get("storage")),
107+
dataset_key(row.get("dataset")),
108+
)
109+
110+
111+
def benchmark_identity_rows(df: pd.DataFrame) -> pd.DataFrame:
112+
"""Return timing rows with the identity used to match a PR benchmark."""
113+
114+
_file_size_rows, timing_rows = split_file_size_rows(df)
115+
if timing_rows.empty or "name" not in timing_rows.columns:
116+
return pd.DataFrame(columns=["commit_id", "benchmark_identity"])
117+
118+
timing_rows = timing_rows.copy()
119+
if "storage" not in timing_rows.columns:
120+
timing_rows["storage"] = pd.NA
121+
if "commit_id" not in timing_rows.columns:
122+
timing_rows["commit_id"] = pd.NA
123+
124+
timing_rows = extract_dataset_key(timing_rows)
125+
timing_rows["benchmark_identity"] = [
126+
tuple(identity_value(row[column]) for column in ("name", "storage", "dataset_key"))
127+
for _, row in timing_rows.iterrows()
128+
]
129+
130+
return timing_rows[["commit_id", "benchmark_identity"]]
131+
132+
133+
def read_jsonl_rows_for_commit(path: str, commit_id: str) -> pd.DataFrame:
134+
"""Read only rows matching a commit from a JSONL benchmark history."""
135+
136+
rows = []
137+
with open(path, encoding="utf-8") as lines:
138+
for line in lines:
139+
if '"commit_id"' not in line or f'"{commit_id}"' not in line:
140+
continue
141+
record = orjson.loads(line)
142+
if record.get("commit_id") == commit_id:
143+
rows.append(record)
144+
return pd.DataFrame(rows)
145+
146+
147+
def read_latest_baseline_rows(path: str, pr: pd.DataFrame) -> pd.DataFrame:
148+
"""Read rows from the latest history commit matching the PR benchmark."""
149+
150+
pr_identities = set(benchmark_identity_rows(pr)["benchmark_identity"])
151+
if not pr_identities:
152+
return pd.read_json(path, lines=True)
153+
154+
baseline_commit_id = None
155+
with open(path, encoding="utf-8") as lines:
156+
for line in lines:
157+
if '"name"' not in line or '"commit_id"' not in line:
158+
continue
159+
record = orjson.loads(line)
160+
if benchmark_identity(record) in pr_identities:
161+
commit_id = record.get("commit_id")
162+
if commit_id is not None:
163+
baseline_commit_id = commit_id
164+
165+
if baseline_commit_id is None:
166+
raise ValueError("No baseline rows found for the benchmark under test")
167+
168+
return read_jsonl_rows_for_commit(path, baseline_commit_id)
169+
170+
171+
def select_latest_baseline_rows(base: pd.DataFrame, pr: pd.DataFrame) -> pd.DataFrame:
172+
"""Select rows from the latest baseline commit containing this benchmark.
173+
174+
The persisted benchmark history is append-only. A row only appears after
175+
that benchmark job uploaded results, so the newest commit with matching row
176+
identities is the latest successful baseline for the benchmark under test.
177+
"""
178+
179+
if base.empty or "commit_id" not in base.columns:
180+
return base
181+
182+
commit_ids = base["commit_id"].dropna().unique()
183+
if len(commit_ids) <= 1:
184+
return base
185+
186+
pr_identities = set(benchmark_identity_rows(pr)["benchmark_identity"])
187+
if not pr_identities:
188+
return base
189+
190+
base_identities = benchmark_identity_rows(base)
191+
matches = base_identities[base_identities["benchmark_identity"].isin(pr_identities)]
192+
matches = matches[matches["commit_id"].notna()]
193+
if matches.empty:
194+
raise ValueError("No baseline rows found for the benchmark under test")
195+
196+
baseline_commit_id = matches["commit_id"].iloc[-1]
197+
return base[base["commit_id"] == baseline_commit_id].copy()
198+
199+
80200
def extract_target_fields(name: str) -> pd.Series:
81201
"""Parse query, engine, and format from the benchmark name."""
82202

@@ -702,8 +822,8 @@ def main() -> None:
702822

703823
benchmark_name = sys.argv[3] if len(sys.argv) > 3 else ""
704824

705-
base = pd.read_json(sys.argv[1], lines=True)
706825
pr = pd.read_json(sys.argv[2], lines=True)
826+
base = read_latest_baseline_rows(sys.argv[1], pr)
707827

708828
base_commit_id = set(base["commit_id"].unique())
709829
pr_commit_id = set(pr["commit_id"].unique())

scripts/tests/test_benchmark_reporting.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,112 @@ def timing_row(name: str, base: int, pr: int) -> dict[str, object]:
3333
}
3434

3535

36+
def stored_timing_row(
37+
commit: str,
38+
name: str,
39+
value: int,
40+
storage: str | None = None,
41+
dataset: dict[str, object] | None = None,
42+
) -> dict[str, object]:
43+
row: dict[str, object] = {
44+
"name": name,
45+
"unit": "ns",
46+
"value": value,
47+
"all_runtimes": [value, value, value],
48+
"commit_id": commit,
49+
}
50+
if storage is not None:
51+
row["storage"] = storage
52+
if dataset is not None:
53+
row["dataset"] = dataset
54+
return row
55+
56+
57+
def test_select_latest_baseline_rows_uses_latest_matching_benchmark_commit() -> None:
58+
compare = load_compare_module()
59+
history = pd.DataFrame(
60+
[
61+
stored_timing_row(
62+
"base-old",
63+
"tpch_q01/datafusion:parquet",
64+
100,
65+
"nvme",
66+
{"scale_factor": "1.0"},
67+
),
68+
file_size_record_for("base-old", 100, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
69+
stored_timing_row(
70+
"base-current",
71+
"tpch_q01/datafusion:parquet",
72+
110,
73+
"nvme",
74+
{"scale_factor": "1.0"},
75+
),
76+
file_size_record_for("base-current", 120, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
77+
stored_timing_row("base-other", "clickbench_q01/datafusion:parquet", 200, "nvme"),
78+
]
79+
)
80+
pr = pd.DataFrame(
81+
[
82+
stored_timing_row(
83+
"pr-sha",
84+
"tpch_q01/datafusion:parquet",
85+
115,
86+
"nvme",
87+
{"scale_factor": "1.0"},
88+
),
89+
]
90+
)
91+
92+
selected = compare.select_latest_baseline_rows(history, pr)
93+
94+
assert set(selected["commit_id"]) == {"base-current"}
95+
assert len(selected) == 2
96+
97+
98+
def test_read_latest_baseline_rows_streams_latest_matching_benchmark_commit(tmp_path: Path) -> None:
99+
compare = load_compare_module()
100+
history_path = tmp_path / "history.jsonl"
101+
history_rows = [
102+
stored_timing_row(
103+
"base-old",
104+
"tpch_q01/datafusion:parquet",
105+
100,
106+
"nvme",
107+
{"scale_factor": "1.0"},
108+
),
109+
file_size_record_for("base-old", 100, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
110+
stored_timing_row(
111+
"base-current",
112+
"tpch_q01/datafusion:parquet",
113+
110,
114+
"nvme",
115+
{"scale_factor": "1.0"},
116+
),
117+
file_size_record_for("base-current", 120, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
118+
stored_timing_row("base-other", "clickbench_q01/datafusion:parquet", 200, "nvme"),
119+
]
120+
history_path.write_text(
121+
"".join(f"{json.dumps(row)}\n" for row in history_rows),
122+
encoding="utf-8",
123+
)
124+
pr = pd.DataFrame(
125+
[
126+
stored_timing_row(
127+
"pr-sha",
128+
"tpch_q01/datafusion:parquet",
129+
115,
130+
"nvme",
131+
{"scale_factor": "1.0"},
132+
),
133+
]
134+
)
135+
136+
selected = compare.read_latest_baseline_rows(history_path, pr)
137+
138+
assert set(selected["commit_id"]) == {"base-current"}
139+
assert len(selected) == 2
140+
141+
36142
def test_within_engine_analysis_uses_each_engines_own_parquet_control() -> None:
37143
compare = load_compare_module()
38144
rows = [

0 commit comments

Comments
 (0)