Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/ISSUE_TEMPLATE/benchmark-run.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# Issue form: request a benchmark run for a single OpenRouter model.
# The companion workflow (.github/workflows/benchmarks-from-issue.yaml)
# parses the "### Model ..." heading this form renders into the issue body,
# so keep the label's leading word "Model" stable.
name: "Benchmark run (OpenRouter)"
description: "Request a benchmark run for a single OpenRouter model"
title: "Benchmark: <author>/<model> (OpenRouter)"
body:
  - type: markdown
    attributes:
      value: |
        Submit a benchmark request for a single OpenRouter model.

        - Only org members/collaborators can trigger runs.
        - A workflow will validate this issue when the `run-benchmark` label is applied.

  # NOTE(review): id is plural ("models") but exactly one model is accepted;
  # kept as-is because renaming a form field id is user-visible churn.
  - type: textarea
    id: models
    attributes:
      label: Model (OpenRouter URL or slug)
      description: |
        Exactly one model. Accepted formats:
        - https://openrouter.ai/<author>/<model>
        - <author>/<model>
      placeholder: |
        https://openrouter.ai/mistralai/mistral-large-2512
    validations:
      required: true

  - type: textarea
    id: notes
    attributes:
      label: Notes (optional)
      description: Any context for reviewers / maintainers.
    validations:
      required: false
348 changes: 348 additions & 0 deletions .github/workflows/benchmarks-from-issue.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,348 @@
---
# Runs a benchmark for a single OpenRouter model when a maintainer applies
# the `run-benchmark` label to an issue created from the benchmark-run form.
name: Benchmarks (from issue)

on:
  issues:
    types: [labeled]

# contents: push the results branch; pull-requests: open the results PR;
# issues: comment on / close the triggering issue.
permissions:
  contents: write
  pull-requests: write
  issues: write

# Serialize runs per issue; a second labeling queues rather than cancels.
concurrency:
  group: benchmarks-issue-${{ github.event.issue.number }}
  cancel-in-progress: false

env:
  PYTHON_VERSION: "3.14"

jobs:
  run:
    if: github.event.label.name == 'run-benchmark'
    runs-on: ubuntu-latest
    timeout-minutes: 720

    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Authorize request
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"

          # Read the association from the event payload rather than ${{ }}
          # interpolation so no untrusted text is expanded into this script.
          ASSOC="$(jq -r '.issue.author_association // ""' "$GITHUB_EVENT_PATH")"
          if [ "$ASSOC" != "MEMBER" ] && [ "$ASSOC" != "OWNER" ] && [ "$ASSOC" != "COLLABORATOR" ]; then
            gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "$(
              cat <<'EOF'
          This benchmark request was closed automatically because the issue author is not authorized to trigger benchmark runs.

          If you believe this is an error, please ask a maintainer to re-open and apply the `run-benchmark` label.
          EOF
            )"
            gh issue close "$ISSUE_NUMBER" -R "$REPO" --reason "not planned"
            # Fail the job so the benchmark steps below do NOT run for an
            # unauthorized author. (A plain `exit 0` here would let the
            # whole benchmark proceed and spend API credits.)
            exit 1
          fi

      - name: Parse and validate single model
        id: model
        run: |
          set -euo pipefail
          python3 - <<'PY'
          import json
          import os
          import re
          import sys
          from urllib.parse import urlparse

          event_path = os.environ["GITHUB_EVENT_PATH"]
          with open(event_path, "r", encoding="utf-8") as f:
              event = json.load(f)
          body = (event.get("issue") or {}).get("body") or ""

          # The issue form renders the model field under a "### Model ..." heading.
          match = re.search(
              r"^###\s+Model(?:s)?\b.*?\n(.*?)(?:\n###\s+|\Z)",
              body,
              flags=re.M | re.S,
          )
          if not match:
              print("Could not find a '### Model' section in the issue body.", file=sys.stderr)
              sys.exit(2)

          def clean(value: str) -> str:
              # Users sometimes paste literal "\n"/"\r" escape sequences into the
              # template; strip them so we never derive filenames like
              # "model-id\\n.yaml".
              return value.replace("\\n", "").replace("\\r", "").strip()

          lines = []
          for line in match.group(1).splitlines():
              s = line.strip()
              # Remove common markdown bullet prefixes.
              if s.startswith(("-", "*")):
                  s = s[1:].strip()
              s = clean(s)
              if not s or s.startswith("#"):
                  continue
              lines.append(s)

          if not lines:
              print("No model found under the '### Model' section.", file=sys.stderr)
              sys.exit(2)
          if len(lines) != 1:
              print(
                  f"Expected exactly 1 model under '### Model', got {len(lines)}.",
                  file=sys.stderr,
              )
              sys.exit(2)

          # Strict slug alphabet. This is also the guard that keeps downstream
          # `${{ steps.model.outputs.* }}` interpolations shell-safe.
          SLUG_RE = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.:-]+$")

          def parse_to_slug(value: str):
              """Return 'author/model' from a slug or openrouter.ai URL, else None."""
              value = clean(value)
              if "://" in value:
                  u = urlparse(value)
                  if u.scheme not in ("http", "https"):
                      return None
                  if u.netloc not in ("openrouter.ai", "www.openrouter.ai"):
                      return None
                  parts = [p for p in u.path.split("/") if p]
                  if len(parts) < 2:
                      return None
                  value = f"{parts[0]}/{parts[1]}"
              elif "/" not in value:
                  return None
              else:
                  author, model = value.split("/", 1)
                  value = f"{author.strip()}/{model.strip()}"
              if not SLUG_RE.match(value):
                  return None
              return value

          slug = parse_to_slug(lines[0])
          if not slug:
              print(
                  f"Invalid model entry: {lines[0]!r} (expected openrouter URL or 'author/model')",
                  file=sys.stderr,
              )
              sys.exit(2)

          model_id = slug.rsplit("/", 1)[-1]
          if not model_id:
              print(f"Could not derive model id from slug: {slug!r}", file=sys.stderr)
              sys.exit(2)

          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as out:
              out.write(f"model_slug={slug}\n")
              out.write(f"model_id={model_id}\n")
          PY

      - name: Export model vars
        env:
          # Pass step outputs through `env:` instead of expanding ${{ }}
          # inside `run:` so untrusted-derived values never become shell
          # source text (script-injection hardening).
          MODEL_SLUG: ${{ steps.model.outputs.model_slug }}
          MODEL_ID: ${{ steps.model.outputs.model_id }}
        run: |
          set -euo pipefail
          {
            echo "MODEL_SLUG=${MODEL_SLUG}"
            echo "MODEL_ID=${MODEL_ID}"
          } >> "$GITHUB_ENV"

      - name: Close invalid issue
        if: failure() && steps.model.outcome == 'failure'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"

          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "$(
            cat <<'EOF'
          This benchmark request was closed automatically because it did not match the required issue template.

          Please ensure the issue contains a `### Model` section with exactly **one** OpenRouter model, as either:
          - `https://openrouter.ai/<author>/<model>`
          - `<author>/<model>`
          EOF
          )"
          gh issue close "$ISSUE_NUMBER" -R "$REPO" --reason "not planned"
          exit 0

      - name: Create work branch
        id: branch
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"

          # Timestamp suffix keeps branch names unique across re-runs.
          TS="$(date -u +%Y%m%d%H%M%S)"
          BRANCH="bench/issue-${ISSUE_NUMBER}-${TS}"
          echo "branch=${BRANCH}" >> "$GITHUB_OUTPUT"

          git checkout -b "${BRANCH}"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          enable-cache: true
          cache-dependency-glob: |
            requirements_dev.txt
            requirements_eval.txt
          activate-environment: true

      - name: Install dependencies
        run: |
          set -euo pipefail
          uv pip install -r requirements_dev.txt --prerelease=allow
          uv pip install -r requirements_eval.txt --prerelease=allow

      - name: Prepare allenporter/home-assistant-synthetic-home
        uses: actions/checkout@v4
        with:
          repository: allenporter/home-assistant-synthetic-home
          path: home-assistant-synthetic-home
          sparse-checkout: |
            custom_components

      - name: Write secrets file for !secret
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: |
          set -euo pipefail
          if [ -z "${OPENROUTER_API_KEY}" ]; then
            echo "Missing required GitHub secret: OPENROUTER_API_KEY"
            echo "Set it in the repository's Actions secrets and re-run."
            exit 1
          fi

          # NOTE(review): a key containing '"' or '\' would break this YAML;
          # OpenRouter keys are alphanumeric-ish so this is acceptable, and
          # the round-trip check below catches any malformed output.
          printf 'openrouter_api_key: "%s"\n' "$OPENROUTER_API_KEY" > "${RUNNER_TEMP}/secrets.yaml"

          python3 - <<'PY'
          import os
          import yaml

          secrets_path = os.path.join(os.environ["RUNNER_TEMP"], "secrets.yaml")
          with open(secrets_path, "r", encoding="utf-8") as f:
              data = yaml.safe_load(f) or {}

          key = (data.get("openrouter_api_key") or "").strip()
          if not key:
              raise SystemExit(f"secrets.yaml written but openrouter_api_key missing/empty: {secrets_path}")

          print("secrets.yaml OK (openrouter_api_key present)")
          PY

      - name: Generate model YAMLs from OpenRouter
        env:
          SYNTHETIC_HOME_DIR: ${{ github.workspace }}/home-assistant-synthetic-home/
          SECRETS_FILE: ${{ runner.temp }}/secrets.yaml
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          echo "Fetching model slug: ${MODEL_SLUG}"
          python3 script/openrouter_fetch_model.py "${MODEL_SLUG}"

          if [ ! -f "models/${MODEL_ID}.yaml" ]; then
            echo "Expected model config not found: models/${MODEL_ID}.yaml"
            echo "models/ directory listing:"
            ls -la models || true
            exit 1
          fi

      - name: Run collection (all datasets)
        env:
          SYNTHETIC_HOME_DIR: ${{ github.workspace }}/home-assistant-synthetic-home/
          SECRETS_FILE: ${{ runner.temp }}/secrets.yaml
          MODEL: ${{ env.MODEL_ID }}
        run: |
          set -euo pipefail
          ./script/eval_collect_all_datasets.sh
          ./script/eval_collect_automations.sh

      - name: Run metrics (assist family + automations)
        env:
          SYNTHETIC_HOME_DIR: ${{ github.workspace }}/home-assistant-synthetic-home/
          SECRETS_FILE: ${{ runner.temp }}/secrets.yaml
        run: |
          set -euo pipefail
          DATASET_NAME=assist ./script/eval_metrics_assist.sh
          DATASET_NAME=assist-mini ./script/eval_metrics_assist.sh
          DATASET_NAME=questions ./script/eval_metrics_assist.sh
          ./script/eval_metrics_automations.sh

      - name: Build leaderboard
        run: |
          set -euo pipefail
          home-assistant-datasets leaderboard build

      - name: Commit and push results
        id: commit
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          BRANCH="${{ steps.branch.outputs.branch }}"
          REPO="${{ github.repository }}"

          git status --porcelain
          # `|| true`: tolerate an unmatched glob (e.g. no new model YAML).
          git add models/*.yaml reports/** || true

          if git diff --cached --quiet; then
            echo "has_changes=0" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

          git commit -m "Benchmark: issue #${ISSUE_NUMBER} (OpenRouter)"
          git push -u origin "${BRANCH}"

          echo "has_changes=1" >> "$GITHUB_OUTPUT"

      - name: Create PR and notify
        if: steps.commit.outputs.has_changes == '1'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"
          BRANCH="${{ steps.branch.outputs.branch }}"

          # MODEL_SLUG is safe to expand here: the parse step restricted it
          # to [A-Za-z0-9_.:-] plus a single '/'.
          PR_URL="$(gh pr create -R "$REPO" --base main --head "${BRANCH}" \
            --title "Benchmark: issue #${ISSUE_NUMBER} (OpenRouter)" \
            --body "$(
          cat <<EOF
          Automated benchmark run triggered from issue #${ISSUE_NUMBER}.

          Requested model:
          - ${MODEL_SLUG}
          EOF
          )")"

          gh pr comment "$PR_URL" -R "$REPO" --body "Pushed benchmark results. Ready for review."
          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "Benchmark run finished. Results are in: ${PR_URL}"

      - name: Notify nothing produced
        if: steps.commit.outputs.has_changes == '0'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"
          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "Benchmark run finished, but nothing new was produced to commit (no PR created)."

      - name: Notify failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          ISSUE_NUMBER="${{ github.event.issue.number }}"
          REPO="${{ github.repository }}"

          # Best-effort: comment on issue, do not close (maintainers may want
          # to inspect logs). May follow an automated close comment when the
          # request was unauthorized or invalid; that noise is acceptable.
          gh issue comment "$ISSUE_NUMBER" -R "$REPO" --body "Benchmark workflow failed. Please see Actions logs for details."
Loading
Loading