Use a per-benchmark baseline instead of the last fully successful run

AdamGS · AdamGS · commit d0b888174fa3 · 2026-06-10T12:27:57.000+01:00
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
@@ -98,16 +98,8 @@ jobs:
         run: |
           set -Eeu -o pipefail -x
 
-          base_commit_sha=$(\
-            curl -L \
-              -H "Accept: application/vnd.github+json" \
-              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-              https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
-            | jq -r '.workflow_runs[].head_sha' \
-          )
-
           python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
-          gzip -d -c data.json.gz | grep $base_commit_sha > base.json
+          gzip -d -c data.json.gz > base.json
 
           echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
           echo '' >> comment.md
diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
@@ -443,16 +443,8 @@ jobs:
         run: |
           set -Eeu -o pipefail -x
 
-          base_commit_sha=$(\
-            curl -L \
-              -H "Accept: application/vnd.github+json" \
-              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-              https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
-            | jq -r '.workflow_runs[].head_sha' \
-          )
-
           python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
-          gzip -d -c data.json.gz | grep $base_commit_sha > base.json
+          gzip -d -c data.json.gz > base.json
 
           echo '# Benchmarks: ${{ matrix.name }}' > comment.md
           echo '' >> comment.md
diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
@@ -77,6 +77,63 @@ def split_file_size_rows(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     return df[mask].copy(), df[~mask].copy()
 
 
+def identity_value(value: Any) -> Any:
+    """Map a missing value (per pd.isna) to None so benchmark identities compare reliably."""
+    # NOTE(review): assumes a scalar; pd.isna is element-wise for array-likes — confirm callers.
+    return None if pd.isna(value) else value
+
+
+def benchmark_identity_rows(df: pd.DataFrame) -> pd.DataFrame:
+    """Return commit_id plus a (name, storage, dataset_key) identity tuple per timing row."""
+    # Only the second frame returned by split_file_size_rows is kept; the first is discarded.
+    _file_size_rows, timing_rows = split_file_size_rows(df)
+    if timing_rows.empty or "name" not in timing_rows.columns:
+        return pd.DataFrame(columns=["commit_id", "benchmark_identity"])
+    # Backfill optional columns with pd.NA so the column lookups below always succeed.
+    timing_rows = timing_rows.copy()
+    if "storage" not in timing_rows.columns:
+        timing_rows["storage"] = pd.NA
+    if "commit_id" not in timing_rows.columns:
+        timing_rows["commit_id"] = pd.NA
+    # NOTE(review): extract_dataset_key is presumed to add a "dataset_key" column — confirm.
+    timing_rows = extract_dataset_key(timing_rows)
+    timing_rows["benchmark_identity"] = [
+        tuple(identity_value(row[column]) for column in ("name", "storage", "dataset_key"))
+        for _, row in timing_rows.iterrows()
+    ]
+
+    return timing_rows[["commit_id", "benchmark_identity"]]
+
+
+def select_latest_baseline_rows(base: pd.DataFrame, pr: pd.DataFrame) -> pd.DataFrame:
+    """Select rows from the latest baseline commit containing this benchmark.
+
+    The persisted history is append-only and a row only appears after its benchmark
+    job uploaded results, so the last matching row marks the latest usable baseline.
+    Returns ``base`` as-is when it is empty, has no commit_id column, holds at most one
+    commit, or ``pr`` has no timing identities; raises ValueError when nothing matches.
+    """
+    if base.empty or "commit_id" not in base.columns:
+        return base
+    # A history with at most one commit is returned without being matched against pr.
+    commit_ids = base["commit_id"].dropna().unique()
+    if len(commit_ids) <= 1:
+        return base
+
+    pr_identities = set(benchmark_identity_rows(pr)["benchmark_identity"])
+    if not pr_identities:
+        return base
+
+    base_identities = benchmark_identity_rows(base)
+    matches = base_identities[base_identities["benchmark_identity"].isin(pr_identities)]
+    matches = matches[matches["commit_id"].notna()]
+    if matches.empty:
+        raise ValueError("No baseline rows found for the benchmark under test")
+    # Relies on row order: the last matching row is taken to belong to the newest commit.
+    baseline_commit_id = matches["commit_id"].iloc[-1]
+    return base[base["commit_id"] == baseline_commit_id].copy()
+
+
 def extract_target_fields(name: str) -> pd.Series:
     """Parse query, engine, and format from the benchmark name."""
 
@@ -704,6 +761,7 @@ def main() -> None:
 
     base = pd.read_json(sys.argv[1], lines=True)
     pr = pd.read_json(sys.argv[2], lines=True)
+    base = select_latest_baseline_rows(base, pr)
 
     base_commit_id = set(base["commit_id"].unique())
     pr_commit_id = set(pr["commit_id"].unique())
diff --git a/scripts/tests/test_benchmark_reporting.py b/scripts/tests/test_benchmark_reporting.py
@@ -33,6 +33,68 @@ def timing_row(name: str, base: int, pr: int) -> dict[str, object]:
     }
 
 
+def stored_timing_row(  # Build one timing record dict tagged with a commit id, for test fixtures.
+    commit: str,
+    name: str,
+    value: int,
+    storage: str | None = None,
+    dataset: dict[str, object] | None = None,
+) -> dict[str, object]:
+    row: dict[str, object] = {
+        "name": name,
+        "unit": "ns",
+        "value": value,
+        "all_runtimes": [value, value, value],  # the same value repeated three times
+        "commit_id": commit,
+    }
+    if storage is not None:  # optional keys are omitted entirely rather than set to None
+        row["storage"] = storage
+    if dataset is not None:
+        row["dataset"] = dataset
+    return row
+
+
+def test_select_latest_baseline_rows_uses_latest_matching_benchmark_commit() -> None:
+    compare = load_compare_module()
+    history = pd.DataFrame(
+        [
+            stored_timing_row(
+                "base-old",
+                "tpch_q01/datafusion:parquet",
+                100,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+            file_size_record_for("base-old", 100, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
+            stored_timing_row(
+                "base-current",
+                "tpch_q01/datafusion:parquet",
+                110,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+            file_size_record_for("base-current", 120, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
+            stored_timing_row("base-other", "clickbench_q01/datafusion:parquet", 200, "nvme"),
+        ]
+    )
+    pr = pd.DataFrame(
+        [
+            stored_timing_row(
+                "pr-sha",
+                "tpch_q01/datafusion:parquet",
+                115,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+        ]
+    )
+    # "base-other" is last in the history but holds a different benchmark, so it must be skipped.
+    selected = compare.select_latest_baseline_rows(history, pr)
+    # Every row of the chosen commit is returned: its timing row plus its file-size record.
+    assert set(selected["commit_id"]) == {"base-current"}
+    assert len(selected) == 2
+
+
 def test_within_engine_analysis_uses_each_engines_own_parquet_control() -> None:
     compare = load_compare_module()
     rows = [