Use a per-benchmarks baseline instead of the last fully succesful run (#8332)

AdamGS · web-flow · commit 0a41704faccb · 2026-06-10T13:05:46.000+01:00
## Summary

Currently - when one post-merge benchmarks fails for any reason, the
baseline we use is the last fully successful run. This PR changes it to
be the last successful run of that specific benchmark.

Tested this works by running it in such a state and visually verifying
the results.

---------

Signed-off-by: Adam Gutglick &lt;adam@spiraldb.com&gt;
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
@@ -98,16 +98,8 @@ jobs:
         run: |
           set -Eeu -o pipefail -x
 
-          base_commit_sha=$(\
-            curl -L \
-              -H "Accept: application/vnd.github+json" \
-              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-              https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
-            | jq -r '.workflow_runs[].head_sha' \
-          )
-
           python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
-          gzip -d -c data.json.gz | grep $base_commit_sha > base.json
+          gzip -d -c data.json.gz > base.json
 
           echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
           echo '' >> comment.md
diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
@@ -443,16 +443,8 @@ jobs:
         run: |
           set -Eeu -o pipefail -x
 
-          base_commit_sha=$(\
-            curl -L \
-              -H "Accept: application/vnd.github+json" \
-              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-              https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
-            | jq -r '.workflow_runs[].head_sha' \
-          )
-
           python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/data.json.gz data.json.gz --no-sign-request
-          gzip -d -c data.json.gz | grep $base_commit_sha > base.json
+          gzip -d -c data.json.gz > base.json
 
           echo '# Benchmarks: ${{ matrix.name }}' > comment.md
           echo '' >> comment.md
diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
@@ -4,6 +4,7 @@
 #   "numpy",
 #   "pandas",
 #   "tabulate",
+#   "orjson"
 # ]
 # ///
 
@@ -18,6 +19,7 @@
 from typing import Any
 
 import numpy as np
+import orjson
 import pandas as pd
 
 # Analysis overview:
@@ -59,9 +61,7 @@ def extract_dataset_key(df: pd.DataFrame) -> pd.DataFrame:
     if "dataset" not in df.columns:
         df["dataset_key"] = pd.NA
     else:
-        df["dataset_key"] = df["dataset"].apply(
-            lambda x: str(sorted(x.items())) if pd.notna(x) and isinstance(x, dict) else pd.NA
-        )
+        df["dataset_key"] = df["dataset"].apply(dataset_key)
     return df
 
 
@@ -77,6 +77,126 @@ def split_file_size_rows(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     return df[mask].copy(), df[~mask].copy()
 
 
+def identity_value(value: Any) -> Any:
+    """Normalize missing values so benchmark identities compare reliably."""
+
+    return None if pd.isna(value) else value
+
+
+def dataset_key(value: Any) -> str | None:
+    """Normalize dataset metadata into the join-key representation."""
+
+    if isinstance(value, dict):
+        return str(sorted(value.items()))
+    return None
+
+
+def benchmark_identity(row: Any) -> tuple[Any, Any, Any] | None:
+    """Return the timing-row identity used to find a matching baseline."""
+
+    if row.get("metric") == FILE_SIZE_METRIC or row.get("file_size") is not None:
+        return None
+
+    name = row.get("name")
+    if name is None:
+        return None
+
+    return (
+        identity_value(name),
+        identity_value(row.get("storage")),
+        dataset_key(row.get("dataset")),
+    )
+
+
+def benchmark_identity_rows(df: pd.DataFrame) -> pd.DataFrame:
+    """Return timing rows with the identity used to match a PR benchmark."""
+
+    _file_size_rows, timing_rows = split_file_size_rows(df)
+    if timing_rows.empty or "name" not in timing_rows.columns:
+        return pd.DataFrame(columns=["commit_id", "benchmark_identity"])
+
+    timing_rows = timing_rows.copy()
+    if "storage" not in timing_rows.columns:
+        timing_rows["storage"] = pd.NA
+    if "commit_id" not in timing_rows.columns:
+        timing_rows["commit_id"] = pd.NA
+
+    timing_rows = extract_dataset_key(timing_rows)
+    timing_rows["benchmark_identity"] = [
+        tuple(identity_value(row[column]) for column in ("name", "storage", "dataset_key"))
+        for _, row in timing_rows.iterrows()
+    ]
+
+    return timing_rows[["commit_id", "benchmark_identity"]]
+
+
+def read_jsonl_rows_for_commit(path: str, commit_id: str) -> pd.DataFrame:
+    """Read only rows matching a commit from a JSONL benchmark history."""
+
+    rows = []
+    with open(path, encoding="utf-8") as lines:
+        for line in lines:
+            if '"commit_id"' not in line or f'"{commit_id}"' not in line:
+                continue
+            record = orjson.loads(line)
+            if record.get("commit_id") == commit_id:
+                rows.append(record)
+    return pd.DataFrame(rows)
+
+
+def read_latest_baseline_rows(path: str, pr: pd.DataFrame) -> pd.DataFrame:
+    """Read rows from the latest history commit matching the PR benchmark."""
+
+    pr_identities = set(benchmark_identity_rows(pr)["benchmark_identity"])
+    if not pr_identities:
+        return pd.read_json(path, lines=True)
+
+    baseline_commit_id = None
+    with open(path, encoding="utf-8") as lines:
+        for line in lines:
+            if '"name"' not in line or '"commit_id"' not in line:
+                continue
+            record = orjson.loads(line)
+            if benchmark_identity(record) in pr_identities:
+                commit_id = record.get("commit_id")
+                if commit_id is not None:
+                    baseline_commit_id = commit_id
+
+    if baseline_commit_id is None:
+        raise ValueError("No baseline rows found for the benchmark under test")
+
+    return read_jsonl_rows_for_commit(path, baseline_commit_id)
+
+
+def select_latest_baseline_rows(base: pd.DataFrame, pr: pd.DataFrame) -> pd.DataFrame:
+    """Select rows from the latest baseline commit containing this benchmark.
+
+    The persisted benchmark history is append-only. A row only appears after
+    that benchmark job uploaded results, so the newest commit with matching row
+    identities is the latest successful baseline for the benchmark under test.
+    """
+
+    if base.empty or "commit_id" not in base.columns:
+        return base
+
+    commit_ids = base["commit_id"].dropna().unique()
+    if len(commit_ids) <= 1:
+        return base
+
+    pr_identities = set(benchmark_identity_rows(pr)["benchmark_identity"])
+    if not pr_identities:
+        return base
+
+    base_identities = benchmark_identity_rows(base)
+    matches = base_identities[base_identities["benchmark_identity"].isin(pr_identities)]
+    matches = matches[matches["commit_id"].notna()]
+    if matches.empty:
+        raise ValueError("No baseline rows found for the benchmark under test")
+
+    baseline_commit_id = matches["commit_id"].iloc[-1]
+    return base[base["commit_id"] == baseline_commit_id].copy()
+
+
 def extract_target_fields(name: str) -> pd.Series:
     """Parse query, engine, and format from the benchmark name."""
 
@@ -702,8 +822,8 @@ def main() -> None:
 
     benchmark_name = sys.argv[3] if len(sys.argv) > 3 else ""
 
-    base = pd.read_json(sys.argv[1], lines=True)
     pr = pd.read_json(sys.argv[2], lines=True)
+    base = read_latest_baseline_rows(sys.argv[1], pr)
 
     base_commit_id = set(base["commit_id"].unique())
     pr_commit_id = set(pr["commit_id"].unique())
diff --git a/scripts/tests/test_benchmark_reporting.py b/scripts/tests/test_benchmark_reporting.py
@@ -33,6 +33,112 @@ def timing_row(name: str, base: int, pr: int) -> dict[str, object]:
     }
 
 
+def stored_timing_row(
+    commit: str,
+    name: str,
+    value: int,
+    storage: str | None = None,
+    dataset: dict[str, object] | None = None,
+) -> dict[str, object]:
+    row: dict[str, object] = {
+        "name": name,
+        "unit": "ns",
+        "value": value,
+        "all_runtimes": [value, value, value],
+        "commit_id": commit,
+    }
+    if storage is not None:
+        row["storage"] = storage
+    if dataset is not None:
+        row["dataset"] = dataset
+    return row
+
+
+def test_select_latest_baseline_rows_uses_latest_matching_benchmark_commit() -> None:
+    compare = load_compare_module()
+    history = pd.DataFrame(
+        [
+            stored_timing_row(
+                "base-old",
+                "tpch_q01/datafusion:parquet",
+                100,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+            file_size_record_for("base-old", 100, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
+            stored_timing_row(
+                "base-current",
+                "tpch_q01/datafusion:parquet",
+                110,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+            file_size_record_for("base-current", 120, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
+            stored_timing_row("base-other", "clickbench_q01/datafusion:parquet", 200, "nvme"),
+        ]
+    )
+    pr = pd.DataFrame(
+        [
+            stored_timing_row(
+                "pr-sha",
+                "tpch_q01/datafusion:parquet",
+                115,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+        ]
+    )
+
+    selected = compare.select_latest_baseline_rows(history, pr)
+
+    assert set(selected["commit_id"]) == {"base-current"}
+    assert len(selected) == 2
+
+
+def test_read_latest_baseline_rows_streams_latest_matching_benchmark_commit(tmp_path: Path) -> None:
+    compare = load_compare_module()
+    history_path = tmp_path / "history.jsonl"
+    history_rows = [
+        stored_timing_row(
+            "base-old",
+            "tpch_q01/datafusion:parquet",
+            100,
+            "nvme",
+            {"scale_factor": "1.0"},
+        ),
+        file_size_record_for("base-old", 100, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
+        stored_timing_row(
+            "base-current",
+            "tpch_q01/datafusion:parquet",
+            110,
+            "nvme",
+            {"scale_factor": "1.0"},
+        ),
+        file_size_record_for("base-current", 120, "tpch", "1.0", "vortex-file-compressed", "part-0.vortex"),
+        stored_timing_row("base-other", "clickbench_q01/datafusion:parquet", 200, "nvme"),
+    ]
+    history_path.write_text(
+        "".join(f"{json.dumps(row)}\n" for row in history_rows),
+        encoding="utf-8",
+    )
+    pr = pd.DataFrame(
+        [
+            stored_timing_row(
+                "pr-sha",
+                "tpch_q01/datafusion:parquet",
+                115,
+                "nvme",
+                {"scale_factor": "1.0"},
+            ),
+        ]
+    )
+
+    selected = compare.read_latest_baseline_rows(history_path, pr)
+
+    assert set(selected["commit_id"]) == {"base-current"}
+    assert len(selected) == 2
+
+
 def test_within_engine_analysis_uses_each_engines_own_parquet_control() -> None:
     compare = load_compare_module()
     rows = [