Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/ISSUE_TEMPLATE/benchmark-run.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# Issue form: request a benchmark run for a single OpenRouter model.
# The companion workflow (.github/workflows/benchmarks-from-issue.yaml)
# parses the "### Model ..." heading this form renders into the issue body,
# so keep the label's leading word "Model" stable.
name: "Benchmark run (OpenRouter)"
description: "Request a benchmark run for a single OpenRouter model"
title: "Benchmark: <author>/<model> (OpenRouter)"
body:
  - type: markdown
    attributes:
      value: |
        Submit a benchmark request for a single OpenRouter model.

        - Only org members/collaborators can trigger runs.
        - A workflow will validate this issue when the `run-benchmark` label is applied.

  # NOTE(review): id is plural ("models") but exactly one model is accepted;
  # kept as-is because renaming a form field id is user-visible churn.
  - type: textarea
    id: models
    attributes:
      label: Model (OpenRouter URL or slug)
      description: |
        Exactly one model. Accepted formats:
        - https://openrouter.ai/<author>/<model>
        - <author>/<model>
      placeholder: |
        https://openrouter.ai/mistralai/mistral-large-2512
    validations:
      required: true

  - type: textarea
    id: notes
    attributes:
      label: Notes (optional)
      description: Any context for reviewers / maintainers.
    validations:
      required: false
348 changes: 348 additions & 0 deletions .github/workflows/benchmarks-from-issue.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,348 @@
---
# Runs a benchmark for a single OpenRouter model when a maintainer applies
# the `run-benchmark` label to an issue created from the benchmark-run form.
name: Benchmarks (from issue)

on:
  issues:
    types: [labeled]

# contents: push the results branch; pull-requests: open the results PR;
# issues: comment on / close the triggering issue.
permissions:
  contents: write
  pull-requests: write
  issues: write

# Serialize runs per issue; a second labeling queues rather than cancels.
concurrency:
  group: benchmarks-issue-${{ github.event.issue.number }}
  cancel-in-progress: false

env:
  PYTHON_VERSION: "3.14"

jobs:
  run:
    if: github.event.label.name == 'run-benchmark'
    runs-on: ubuntu-latest
    timeout-minutes: 720

    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Authorize request
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"

          # Read the association from the event payload rather than ${{ }}
          # interpolation so no untrusted text is expanded into this script.
          ASSOC="$(jq -r '.issue.author_association // ""' "$GITHUB_EVENT_PATH")"
          if [ "$ASSOC" != "MEMBER" ] && [ "$ASSOC" != "OWNER" ] && [ "$ASSOC" != "COLLABORATOR" ]; then
            gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "$(
              cat <<'EOF'
          This benchmark request was closed automatically because the issue author is not authorized to trigger benchmark runs.

          If you believe this is an error, please ask a maintainer to re-open and apply the `run-benchmark` label.
          EOF
            )"
            gh issue close "$ISSUE_NUMBER" -R "$REPO" --reason "not planned"
            # Fail the job so the benchmark steps below do NOT run for an
            # unauthorized author. (A plain `exit 0` here would let the
            # whole benchmark proceed and spend API credits.)
            exit 1
          fi

      - name: Parse and validate single model
        id: model
        run: |
          set -euo pipefail
          python3 - <<'PY'
          import json
          import os
          import re
          import sys
          from urllib.parse import urlparse

          event_path = os.environ["GITHUB_EVENT_PATH"]
          with open(event_path, "r", encoding="utf-8") as f:
              event = json.load(f)
          body = (event.get("issue") or {}).get("body") or ""

          # The issue form renders the model field under a "### Model ..." heading.
          match = re.search(
              r"^###\s+Model(?:s)?\b.*?\n(.*?)(?:\n###\s+|\Z)",
              body,
              flags=re.M | re.S,
          )
          if not match:
              print("Could not find a '### Model' section in the issue body.", file=sys.stderr)
              sys.exit(2)

          def clean(value: str) -> str:
              # Users sometimes paste literal "\n"/"\r" escape sequences into the
              # template; strip them so we never derive filenames like
              # "model-id\\n.yaml".
              return value.replace("\\n", "").replace("\\r", "").strip()

          lines = []
          for line in match.group(1).splitlines():
              s = line.strip()
              # Remove common markdown bullet prefixes.
              if s.startswith(("-", "*")):
                  s = s[1:].strip()
              s = clean(s)
              if not s or s.startswith("#"):
                  continue
              lines.append(s)

          if not lines:
              print("No model found under the '### Model' section.", file=sys.stderr)
              sys.exit(2)
          if len(lines) != 1:
              print(
                  f"Expected exactly 1 model under '### Model', got {len(lines)}.",
                  file=sys.stderr,
              )
              sys.exit(2)

          # Strict slug alphabet. This is also the guard that keeps downstream
          # `${{ steps.model.outputs.* }}` interpolations shell-safe.
          SLUG_RE = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.:-]+$")

          def parse_to_slug(value: str):
              """Return 'author/model' from a slug or openrouter.ai URL, else None."""
              value = clean(value)
              if "://" in value:
                  u = urlparse(value)
                  if u.scheme not in ("http", "https"):
                      return None
                  if u.netloc not in ("openrouter.ai", "www.openrouter.ai"):
                      return None
                  parts = [p for p in u.path.split("/") if p]
                  if len(parts) < 2:
                      return None
                  value = f"{parts[0]}/{parts[1]}"
              elif "/" not in value:
                  return None
              else:
                  author, model = value.split("/", 1)
                  value = f"{author.strip()}/{model.strip()}"
              if not SLUG_RE.match(value):
                  return None
              return value

          slug = parse_to_slug(lines[0])
          if not slug:
              print(
                  f"Invalid model entry: {lines[0]!r} (expected openrouter URL or 'author/model')",
                  file=sys.stderr,
              )
              sys.exit(2)

          model_id = slug.rsplit("/", 1)[-1]
          if not model_id:
              print(f"Could not derive model id from slug: {slug!r}", file=sys.stderr)
              sys.exit(2)

          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as out:
              out.write(f"model_slug={slug}\n")
              out.write(f"model_id={model_id}\n")
          PY

      - name: Export model vars
        env:
          # Pass step outputs through `env:` instead of expanding ${{ }}
          # inside `run:` so untrusted-derived values never become shell
          # source text (script-injection hardening).
          MODEL_SLUG: ${{ steps.model.outputs.model_slug }}
          MODEL_ID: ${{ steps.model.outputs.model_id }}
        run: |
          set -euo pipefail
          {
            echo "MODEL_SLUG=${MODEL_SLUG}"
            echo "MODEL_ID=${MODEL_ID}"
          } >> "$GITHUB_ENV"

      - name: Close invalid issue
        if: failure() && steps.model.outcome == 'failure'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"

          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "$(
            cat <<'EOF'
          This benchmark request was closed automatically because it did not match the required issue template.

          Please ensure the issue contains a `### Model` section with exactly **one** OpenRouter model, as either:
          - `https://openrouter.ai/<author>/<model>`
          - `<author>/<model>`
          EOF
          )"
          gh issue close "$ISSUE_NUMBER" -R "$REPO" --reason "not planned"
          exit 0

      - name: Create work branch
        id: branch
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"

          # Timestamp suffix keeps branch names unique across re-runs.
          TS="$(date -u +%Y%m%d%H%M%S)"
          BRANCH="bench/issue-${ISSUE_NUMBER}-${TS}"
          echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT"

          git checkout -b "${BRANCH}"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          enable-cache: true
          cache-dependency-glob: |
            requirements_dev.txt
            requirements_eval.txt
          activate-environment: true

      - name: Install dependencies
        run: |
          set -euo pipefail
          uv pip install -r requirements_dev.txt --prerelease=allow
          uv pip install -r requirements_eval.txt --prerelease=allow

      - name: Prepare allenporter/home-assistant-synthetic-home
        uses: actions/checkout@v4
        with:
          repository: allenporter/home-assistant-synthetic-home
          path: home-assistant-synthetic-home
          sparse-checkout: |
            custom_components

      - name: Write secrets file for !secret
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: |
          set -euo pipefail
          if [ -z "${OPENROUTER_API_KEY}" ]; then
            echo "Missing required GitHub secret: OPENROUTER_API_KEY"
            echo "Set it in the repository's Actions secrets and re-run."
            exit 1
          fi

          # NOTE(review): a key containing '"' or '\' would break this YAML;
          # OpenRouter keys are alphanumeric-ish so this is acceptable, and
          # the round-trip check below catches any malformed output.
          printf 'openrouter_api_key: "%s"\n' "$OPENROUTER_API_KEY" > "${RUNNER_TEMP}/secrets.yaml"

          python3 - <<'PY'
          import os
          import yaml

          secrets_path = os.path.join(os.environ["RUNNER_TEMP"], "secrets.yaml")
          with open(secrets_path, "r", encoding="utf-8") as f:
              data = yaml.safe_load(f) or {}

          key = (data.get("openrouter_api_key") or "").strip()
          if not key:
              raise SystemExit(f"secrets.yaml written but openrouter_api_key missing/empty: {secrets_path}")

          print("secrets.yaml OK (openrouter_api_key present)")
          PY

      - name: Generate model YAMLs from OpenRouter
        env:
          SYNTHETIC_HOME_DIR: ${{ github.workspace }}/home-assistant-synthetic-home/
          SECRETS_FILE: ${{ runner.temp }}/secrets.yaml
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          echo "Fetching model slug: ${MODEL_SLUG}"
          python3 script/openrouter_fetch_model.py "${MODEL_SLUG}"

          if [ ! -f "models/${MODEL_ID}.yaml" ]; then
            echo "Expected model config not found: models/${MODEL_ID}.yaml"
            echo "models/ directory listing:"
            ls -la models || true
            exit 1
          fi

      - name: Run collection (all datasets)
        env:
          SYNTHETIC_HOME_DIR: ${{ github.workspace }}/home-assistant-synthetic-home/
          SECRETS_FILE: ${{ runner.temp }}/secrets.yaml
          MODEL: ${{ env.MODEL_ID }}
        run: |
          set -euo pipefail
          ./script/eval_collect_all_datasets.sh
          ./script/eval_collect_automations.sh

      - name: Run metrics (assist family + automations)
        env:
          SYNTHETIC_HOME_DIR: ${{ github.workspace }}/home-assistant-synthetic-home/
          SECRETS_FILE: ${{ runner.temp }}/secrets.yaml
        run: |
          set -euo pipefail
          DATASET_NAME=assist ./script/eval_metrics_assist.sh
          DATASET_NAME=assist-mini ./script/eval_metrics_assist.sh
          DATASET_NAME=questions ./script/eval_metrics_assist.sh
          ./script/eval_metrics_automations.sh

      - name: Build leaderboard
        run: |
          set -euo pipefail
          home-assistant-datasets leaderboard build

      - name: Commit and push results
        id: commit
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          BRANCH="${{ steps.branch.outputs.branch }}"
          REPO="${{ github.repository }}"

          git status --porcelain
          # `|| true`: tolerate an unmatched glob (e.g. no new model YAML).
          git add models/*.yaml reports/** || true

          if git diff --cached --quiet; then
            echo "has_changes=0" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

          git commit -m "Benchmark: issue #${ISSUE_NUMBER} (OpenRouter)"
          git push -u origin "${BRANCH}"

          echo "has_changes=1" >> "$GITHUB_OUTPUT"

      - name: Create PR and notify
        if: steps.commit.outputs.has_changes == '1'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"
          BRANCH="${{ steps.branch.outputs.branch }}"

          # MODEL_SLUG is safe to expand here: the parse step restricted it
          # to [A-Za-z0-9_.:-] plus a single '/'.
          PR_URL="$(gh pr create -R "$REPO" --base main --head "${BRANCH}" \
            --title "Benchmark: issue #${ISSUE_NUMBER} (OpenRouter)" \
            --body "$(
          cat <<EOF
          Automated benchmark run triggered from issue #${ISSUE_NUMBER}.

          Requested model:
          - ${MODEL_SLUG}
          EOF
          )")"

          gh pr comment "$PR_URL" -R "$REPO" --body "Pushed benchmark results. Ready for review."
          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "Benchmark run finished. Results are in: ${PR_URL}"

      - name: Notify nothing produced
        if: steps.commit.outputs.has_changes == '0'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"
          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "Benchmark run finished, but nothing new was produced to commit (no PR created)."

      - name: Notify failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"

          # Best-effort: comment on issue, do not close (maintainers may want
          # to inspect logs). May follow an automated close comment when the
          # request was unauthorized or invalid; that noise is acceptable.
          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "Benchmark workflow failed. Please see Actions logs for details."
Loading
Loading