Add behavioral tests to github workflows

danielholanda · danielholanda · commit c662296618a2 · 2026-06-17T15:37:05.000-07:00
diff --git a/.github/scripts/select_behavioral.py b/.github/scripts/select_behavioral.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env -S uv run --quiet
+# /// script
+# requires-python = ">=3.10"
+# dependencies = []
+# ///
+"""Select which behavioral tests to run, by skill name.
+
+Behavioral tests live one-per-skill (see CONTRIBUTING.md) at:
+
+    eval/behavioral/tests/test_<skill_with_underscores>.py
+
+and exercise the matching skill under skills/<skill>/. Skill names are
+lowercase-with-hyphens; the test filename swaps the hyphens for underscores
+(``local-ai-use`` -> ``test_local_ai_use.py``) because that is what Python
+import / pytest collection require.
+
+This script maps a set of changed files (read from stdin, one path per line)
+to the skills whose behavioral test should run, and is also used to enumerate
+every testable skill for manual / full runs.
+
+Output is always a JSON array of skill names on stdout, suitable for a GitHub
+Actions matrix:
+
+    uv run .github/scripts/select_behavioral.py --all
+    uv run .github/scripts/select_behavioral.py --names "local-ai-use,rocm-doctor"
+    git diff --name-only BASE HEAD | uv run .github/scripts/select_behavioral.py --changed
+
+A skill is "testable" only when both its test file and its skill's SKILL.md
+exist; that keeps the matrix honest if a test is added before its skill (or
+vice versa).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent  # .github/scripts/<this file> -> repo root
+TESTS_DIR = REPO_ROOT / "eval" / "behavioral" / "tests"
+SKILLS_DIR = REPO_ROOT / "skills"
+
+TEST_PREFIX = "test_"  # test files are named test_<skill_with_underscores>.py
+TEST_SUFFIX = ".py"
+
+# Touching any of these means the shared harness (not one skill) changed, so we
+# re-run every behavioral test rather than trying to guess the blast radius.
+# Paths are repo-root-relative and use forward slashes to match `git diff`.
+INFRA_FILES = {
+    "eval/behavioral/harness.py",
+    "eval/behavioral/conftest.py",
+    "eval/behavioral/pytest.ini",
+    "eval/behavioral/requirements.txt",
+    "eval/claude_eval.py",
+    ".github/scripts/select_behavioral.py",
+    ".github/workflows/behavioral.yml",
+}
+
+
+def skill_to_test(skill: str) -> str:
+    """Build the test filename for a skill: `local-ai-use` -> `test_local_ai_use.py`."""
+    return TEST_PREFIX + skill.replace("-", "_") + TEST_SUFFIX
+
+
+def test_to_skill(filename: str) -> str:
+    """Undo skill_to_test's renaming: `test_local_ai_use.py` -> `local-ai-use`."""
+    prefix_len, suffix_len = len(TEST_PREFIX), len(TEST_SUFFIX)
+    return filename[prefix_len:-suffix_len].replace("_", "-")
+
+
+def is_testable(skill: str) -> bool:
+    """True when the skill's test file and its skills/<skill>/SKILL.md both exist."""
+    if not (TESTS_DIR / skill_to_test(skill)).is_file():
+        return False
+    return (SKILLS_DIR / skill / "SKILL.md").is_file()
+
+
+def all_testable_skills() -> list[str]:
+    """List, in sorted order, every skill that passes is_testable()."""
+    # Without a tests directory there is nothing to enumerate.
+    if not TESTS_DIR.is_dir():
+        return []
+    pattern = f"{TEST_PREFIX}*{TEST_SUFFIX}"
+    candidates = {test_to_skill(entry.name) for entry in TESTS_DIR.glob(pattern)}
+    # Drop tests whose skill is missing; sorting makes the output deterministic.
+    testable = [name for name in candidates if is_testable(name)]
+    return sorted(testable)
+
+
+def select_from_changes(changed: list[str]) -> list[str]:
+    """Return the sorted testable skills touched by the changed paths."""
+    # Trim whitespace, skip blanks, and use forward slashes like INFRA_FILES.
+    paths = {raw.strip().replace("\\", "/") for raw in changed if raw.strip()}
+
+    # Any shared-harness file in the change set means the full suite runs.
+    if not paths.isdisjoint(INFRA_FILES):
+        return all_testable_skills()
+
+    tests_prefix = "eval/behavioral/tests/"
+    candidates = set()
+    for entry in paths:
+        if entry.startswith("skills/"):
+            # skills/<name>/... -> the skill is the second path component.
+            candidates.add(entry.split("/")[1])
+        if entry.startswith(tests_prefix) and entry.endswith(TEST_SUFFIX):
+            # An edited behavioral test file maps back to the skill it covers.
+            basename = Path(entry).name
+            if basename.startswith(TEST_PREFIX):
+                candidates.add(test_to_skill(basename))
+
+    # Keep only skills that have both a test file and a SKILL.md.
+    return sorted(name for name in candidates if is_testable(name))
+
+
+def select_from_names(names: str) -> list[str]:
+    """Return the sorted, de-duplicated testable skills from a comma-separated list."""
+    wanted = {token.strip() for token in names.split(",")}
+    return sorted(name for name in wanted if name and is_testable(name))
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse the mode flag, print the selected skills as a JSON array, return 0."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    # Exactly one of --all / --changed / --names must be given.
+    modes = parser.add_mutually_exclusive_group(required=True)
+    modes.add_argument(
+        "--all", action="store_true", help="Print every skill that has a behavioral test."
+    )
+    modes.add_argument(
+        "--changed",
+        action="store_true",
+        help="Read changed file paths from stdin and print the affected skills.",
+    )
+    modes.add_argument(
+        "--names",
+        metavar="A,B,C",
+        help="Print the testable subset of this comma-separated skill list.",
+    )
+    options = parser.parse_args(argv)
+
+    if options.names is not None:
+        selected = select_from_names(options.names)
+    elif options.all:
+        selected = all_testable_skills()
+    else:  # --changed: one path per line on stdin
+        selected = select_from_changes(sys.stdin.read().splitlines())
+
+    print(json.dumps(selected))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/.github/workflows/behavioral.yml b/.github/workflows/behavioral.yml
@@ -0,0 +1,171 @@
+name: behavioral
+
+# Behavioral tests run a real agent against a skill and grade what it did (see
+# eval/behavioral/). They cost real API tokens and, for some skills, install
+# and exercise local models, so they are NOT part of the always-on PR gate.
+# Instead they are:
+#
+#   * label-gated on PRs -- a maintainer adds the `run-behavioral` label to opt
+#     a PR in. This keeps the ANTHROPIC_API_KEY secret away from untrusted /
+#     fork code, which runs with tool permissions bypassed.
+#   * selective -- only the skills whose folder or test changed are run (the
+#     whole suite runs when the shared harness changes). See
+#     .github/scripts/select_behavioral.py.
+#   * dispatchable -- run any subset by hand from the Actions tab.
+#
+# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
+# branch protection can require just the `behavioral` check. That gate passes
+# (neutral) when the label is absent, so it never blocks an unlabeled PR.
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, labeled]
+    # Deliberately no `paths:` filter. The `behavioral` gate is meant to be a
+    # required check, and a required check whose workflow is skipped by path
+    # filtering stays "Pending" and blocks the merge. Running on every PR is
+    # cheap: without the label only the gate job runs, and with it
+    # select_behavioral.py returns [] when no skill, test, or harness file
+    # changed.
+  workflow_dispatch:
+    inputs:
+      skills:
+        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
+        required: false
+        default: ""
+
+concurrency:
+  group: behavioral-${{ github.event.pull_request.number || github.ref }}  # one group per PR, else per ref
+  cancel-in-progress: true  # a newer run in the same group cancels the one still running
+
+permissions:
+  contents: read  # the workflow token only needs to read the repository
+
+env:
+  BEHAVIORAL_LABEL: run-behavioral  # used in the gate's log message; the job `if:` repeats the literal
+
+jobs:
+  # Decide which skills to run. On PRs this is gated on the `run-behavioral`
+  # label so the secret is never exposed to code that a maintainer hasn't
+  # vouched for.
+  discover:
+    name: Select behavioral tests
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' &&
+       contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
+    runs-on: ubuntu-latest
+    outputs:
+      skills: ${{ steps.select.outputs.skills }}
+      any: ${{ steps.select.outputs.any }}
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+        with:
+          # Need the merge base so `git diff` can see what the PR changed.
+          fetch-depth: 0
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@v7
+
+      - name: Select skills
+        id: select
+        env:
+          # Passed via env, not inlined, so input text is never parsed as shell.
+          EVENT_NAME: ${{ github.event_name }}
+          REQUESTED: ${{ github.event.inputs.skills }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          set -euo pipefail
+          if [ "$EVENT_NAME" != "workflow_dispatch" ]; then
+            # Three-dot diff: only what the PR changed since its merge base.
+            skills=$(git diff --name-only "$BASE_SHA...$HEAD_SHA" \
+              | uv run .github/scripts/select_behavioral.py --changed)
+          elif [ -n "$REQUESTED" ]; then
+            skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED")
+          else
+            skills=$(uv run .github/scripts/select_behavioral.py --all)
+          fi
+          echo "Selected skills: $skills"
+          echo "skills=$skills" >> "$GITHUB_OUTPUT"
+          if [ "$skills" = "[]" ]; then any=false; else any=true; fi
+          echo "any=$any" >> "$GITHUB_OUTPUT"
+
+  behavioral:
+    name: Behavioral (${{ matrix.skill }})
+    needs: discover
+    if: needs.discover.outputs.any == 'true'
+    runs-on: ubuntu-latest
+    # Behavioral runs install local models and can take a while; cap it so a
+    # hung agent or stalled model pull fails the job instead of burning minutes.
+    timeout-minutes: 45
+    strategy:
+      # One skill failing should not hide the others' results.
+      fail-fast: false
+      matrix:
+        skill: ${{ fromJson(needs.discover.outputs.skills) }}
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Set up Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install the claude CLI
+        run: npm install -g @anthropic-ai/claude-code
+
+      - name: Install behavioral test dependencies
+        run: pip install -r eval/behavioral/requirements.txt
+
+      - name: Run behavioral test for ${{ matrix.skill }}
+        working-directory: eval/behavioral
+        env:
+          # The CLI authenticates from this key; it is only present on labeled
+          # PRs and manual dispatch (see the `discover` gate above).
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          # Skill under test, for the script below and for any test that relies on the env.
+          BEHAVIORAL_SKILL: ${{ matrix.skill }}
+        run: |
+          set -euo pipefail
+          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
+          echo "Running $test_file"
+          pytest "$test_file"
+
+  # Single aggregate gate. Mark THIS check required in branch protection.
+  # It passes when behavioral tests were not requested (no label) and when
+  # every selected behavioral job succeeded, so it never blocks an unlabeled PR.
+  behavioral-gate:
+    name: behavioral
+    needs: [discover, behavioral]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Verify behavioral results
+        run: |
+          discover_result="${{ needs.discover.result }}"
+          behavioral_result="${{ needs.behavioral.result }}"
+          echo "discover:   $discover_result"
+          echo "behavioral: $behavioral_result"
+          # Label absent: behavioral tests were not requested, so no-op pass.
+          if [ "$discover_result" = "skipped" ]; then
+            echo "Behavioral tests not requested (no '${BEHAVIORAL_LABEL}' label)."
+            exit 0
+          fi
+          # A failed/cancelled discover also leaves `behavioral` skipped; that
+          # must fail the gate rather than read as "nothing matched".
+          if [ "$discover_result" != "success" ]; then
+            echo "Selecting behavioral tests did not succeed." >&2
+            exit 1
+          fi
+          # Nothing matched the change set, or everything that ran passed.
+          case "$behavioral_result" in
+            success|skipped) echo "All requested behavioral tests passed." ;;
+            *) echo "One or more behavioral tests failed." >&2; exit 1 ;;
+          esac
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -196,6 +196,34 @@ Test the skill the way users will hit it:
 2. Run the skill end-to-end on a real machine. Watch where the agent hesitates, asks unnecessary questions, or goes off-script.
 3. Bring those observations back into the skill, usually as a sharper description, a clearer default, or a missing prerequisite, rather than adding more prose.
 
+## Behavioral tests
+
+Structural validation proves a skill is *well-formed*; behavioral tests prove it
+*works*. A behavioral test runs a real agent against the skill once and grades
+what the agent did — see the harness in [`eval/behavioral/`](eval/behavioral/).
+
+Conventions:
+
+- **One file per skill, centralized.** Put the test at
+  `eval/behavioral/tests/test_<skill>.py`, swapping the skill name's hyphens for
+  underscores (`local-ai-use` → `test_local_ai_use.py`). Tests live here, not
+  inside `skills/<name>/`, because the harness copies the skill folder into the
+  agent's sandbox at runtime — test files in there would pollute the workspace.
+- **Write checks against behavior.** Combine deterministic assertions
+  (`logs_contains`, `workspace_contains`) with LLM-judged expectations
+  (`should`, `should_not`). See `test_local_ai_use.py` for the pattern.
+
+Run one locally (needs the `claude` CLI authenticated and any per-skill
+prerequisites, e.g. a reachable Lemonade Server for `local-ai-use`):
+
+```bash
+pip install -r eval/behavioral/requirements.txt
+cd eval/behavioral && pytest tests/test_local_ai_use.py
+```
+
+In CI, the `behavioral` workflow runs these tests on a PR **only** once a maintainer
+adds the `run-behavioral` label; it can also be dispatched by hand from the Actions tab.
+
 ## Pre-publish checklist
 
 - [ ] Description states the user's goal and includes likely trigger phrases