Fix benchmark workflow bugs

kaitj · kaitj · commit b35fc7609ccf · 2026-05-06T15:45:43.000-04:00
- Switch to shortened SHA for PR
- Add PR number to output file and artifact names so they are unique
- Disable comparison against tag until the next release (tagged release lacks the benchmark dependency group)
- Add step to comment on PR
- Sort labels for comment
diff --git a/.github/scripts/compare_benchmarks.py b/.github/scripts/compare_benchmarks.py
@@ -2,10 +2,17 @@
 """Compare benchmark results across PR, main, and tag and output a markdown table."""
 
 import json
+import logging
+import re
 import statistics
 from pathlib import Path
 from typing import Literal, NamedTuple
 
+_logger = logging.getLogger(__name__)
+
+
+ALERT = 250  # Value (arbitrary; in ms) to indicate difference between benchmarks
+
 
 class BenchmarkResult(NamedTuple):
     fullname: str
@@ -64,9 +71,9 @@ def _delta(pr: BenchmarkResult, ref: BenchmarkResult) -> str:
     if ref == 0:
         return "N/A"
     diff = _scale(pr.median - ref.median)
-    pct = (pr.median / ref.median - 1) * 100
-    icon = "🔴" if pct > 5 else "🟢" if pct < -5 else "⚪"
-    return f"{icon} {diff:+.3f} ms ({pct:+.1f}%)"
+    # Indicator for 250ms absolute diff (arbitrary)
+    icon = "🔴" if diff > ALERT else "🟢" if diff < -ALERT else "⚪"
+    return f"{icon} {diff:+.3f}ms"
 
 
 def _label(result: BenchmarkResult) -> str:
@@ -83,10 +90,13 @@ def _label(result: BenchmarkResult) -> str:
 def build_table(
     pr: dict[str, BenchmarkResult],
     main: dict[str, BenchmarkResult],
-    tag: dict[str, BenchmarkResult],
-    tag_name: str,
+    tag: dict[str, BenchmarkResult] = {},
+    tag_name: str | None = None,
 ) -> str:
     all_keys = set(pr) | set(main) | set(tag)
+    all_keys = sorted(
+        all_keys, key=lambda x: (0 if "index" in x else 1 if "query" in x else 2, x)
+    )
     labels = [_label((pr.get(k) or main.get(k) or tag.get(k))) for k in all_keys]
 
     col_sep = " | "
@@ -110,14 +120,14 @@ def delta_row(label: str, ref: dict[str, BenchmarkResult]) -> str:
         divider,
         row("PR", pr),
         row("main", main),
-        row(tag_name, tag),
+        # row(tag_name, tag),
         divider.replace("-", ""),
         delta_row("PR vs main", main),
-        delta_row(f"PR vs {tag_name}", tag),
+        # delta_row(f"PR vs {tag_name}", tag),
         "",
         "> `median (mean ± std)`",
         "> ",
-        "🔴 >5% slower &nbsp; ⚪ within 5% &nbsp; 🟢 >5% faster",
+        f"> 🔴 >{ALERT}ms slower &nbsp; ⚪ within {ALERT}ms &nbsp; 🟢 >{ALERT}ms faster",
     ]
     return "\n".join(lines)
 
@@ -134,27 +144,33 @@ def main():
     parser.add_argument(
         "-o",
         "--output",
+        type=Path,
         help="Output markdown filepath containing benchmark comparisons",
     )
     args = parser.parse_args()
 
     files = sorted(Path(".").glob(args.pattern))
-    assert len(files) == 3, f"Expected 3 files, found {len(files)}: {files}"
+    assert len(files) > 1, "Expected more than 1 file for benchmark comparison."
 
     # Infer pr/main/tag from directory name
     parsed: dict[str, BenchmarkResult] = {}
     tag = None
     for f in files:
-        stem = f.parent.name  # e.g. "benchmark-pr"
-        key = stem.split("-")[-1]  # "pr", "main", tag
-        if key not in ("pr", "main"):
+        stem = f.name  # e.g. "benchmark-pr-PR-#"
+        key = stem.split("-")[1]  # commit-sha, "main", tag
+
+        # Special cases
+        if re.match(r"^v\d+\.\d+.\d+$", key):
             tag = key
+        elif key != "main":
+            key = "pr"
+
         parsed[key] = parse_file(f)
     if tag is None:
-        raise ValueError("Unknown tag")
-    table = build_table(parsed["pr"], parsed["main"], parsed[tag], tag_name=tag)
+        _logger.warning("Tag not found")
+    table = build_table(parsed["pr"], parsed["main"], parsed.get(tag, {}), tag_name=tag)
     args.output.write_text(table)
-    print(table)
+    _logger.info(table)
 
 
 if __name__ == "__main__":
diff --git a/.github/scripts/run_benchmarks.py b/.github/scripts/run_benchmarks.py
@@ -14,7 +14,7 @@ def main():
     pytest.main(
         [
             "-m",
-            "benchmark and not cloud",
+            "benchmark",
             "--benchmark-save-data",
             f"--benchmark-json={args.output}",
             "--benchmark-time-unit=ms",
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
@@ -4,31 +4,38 @@ on:
   pull_request:
     branches: [ "main" ]
 
+permissions:
+  pull-requests: write
 jobs:
-  get-tag:
+  prep:
     runs-on: ubuntu-latest
     outputs:
       tag: ${{ steps.last_tag.outputs.tag }}
+      short_sha: ${{ steps.short.outputs.sha }}
     steps:
       - uses: actions/checkout@v6
         with:
           fetch-tags: true
           fetch-depth: 0
       - id: last_tag
-        run: echo ="tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
+        run: echo "tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
+      - id: short
+        run: echo "sha=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT
 
   benchmark:
-    needs: get-tag
+    needs: prep
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         target:
-          - name: pr
+          - name: ${{ needs.prep.outputs.short_sha }}
             ref: ${{ github.sha }}
           - name: main
             ref: main
-          - name: ${{ needs.get_tag.outputs.tag }}
-            ref: ${{ needs.get_tag.outputs.tag }}
+          # Tag comparison disabled until next release (missing benchmark dependencies)
+          # - name: ${{ needs.prep.outputs.tag }}
+          #   ref: ${{ needs.prep.outputs.tag }}
     steps:
       - uses: actions/checkout@v6
         with:
@@ -38,24 +45,45 @@ jobs:
       - run: uv sync --extra "cloud"
       - name: Run benchmarks
         run: |
-          uv run .github/scripts/run_benchmarks.py \ 
-            --output benchmark-${{matrix.target.name }}.json
+          FILENAME="benchmark-${{ matrix.target.name }}-PR-${{ github.event.pull_request.number }}.json"
+          uv run .github/scripts/run_benchmarks.py --output "$FILENAME"
+          echo "REPORT_PATH=$FILENAME" >> $GITHUB_ENV
       - uses: actions/upload-artifact@v7
         with:
-          name: benchmark-${{ matrix.target.name }}
-          path: benchmark-${{ matrix.target.name }}.json
+          name: benchmark-${{ matrix.target.name }}-PR-${{
+            github.event.pull_request.number }}
+          path: ${{ env.REPORT_PATH }}
+          retention-days: 1
+          overwrite: true
 
   report:
-    needs: [ get-tag, benchmark ]
+    needs: [ prep, benchmark ]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v6
       - uses: astral-sh/setup-uv@v8.1.0
       - uses: actions/download-artifact@v8
         with:
           pattern: benchmark-*
+          merge-multiple: true
+          path: benchmark-results
       - name: Generate report
         run: |
           uv run .github/scripts/compare_benchmarks.py \
-            --output benchmarks.md \
-            --pattern benchmark-*.json
+            --output "benchmarks.md" \
+            --pattern "benchmark-results/benchmark-*-PR-${{ github.event.pull_request.number }}.json"
+      - name: Find Comment
+        uses: peter-evans/find-comment@v3
+        id: fc
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-author: "github-actions[bot]"
+          body-includes: "Benchmark Results"
+
+      - name: Create / update comment
+        uses: peter-evans/create-or-update-comment@v5
+        with:
+          comment-id: ${{ steps.fc.outputs.comment-id }}
+          issue-number: ${{ github.event.pull_request.number }}
+          body-path: "benchmarks.md"
+          edit-mode: replace

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ def main():`
`14`	`14`	`pytest.main(`
`15`	`15`	`[`
`16`	`16`	`"-m",`
`17`		`- "benchmark and not cloud",`
	`17`	`+ "benchmark",`
`18`	`18`	`"--benchmark-save-data",`
`19`	`19`	`f"--benchmark-json={args.output}",`
`20`	`20`	`"--benchmark-time-unit=ms",`