#!/usr/bin/env -S uv run --quiet
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""Select which behavioral tests to run, by skill name.

Behavioral tests live one-per-skill (see CONTRIBUTING.md) at:

    eval/behavioral/tests/test_<skill_name>.py

and exercise the matching skill under skills/<skill-name>/. Skill names are
lowercase-with-hyphens; the test filename swaps the hyphens for underscores
(``local-ai-use`` -> ``test_local_ai_use.py``) because that is what Python
import / pytest collection require.

This script maps a set of changed files (read from stdin, one path per line)
to the skills whose behavioral test should run, and is also used to enumerate
every testable skill for manual / full runs.

Output is always a JSON array of skill names on stdout, suitable for a GitHub
Actions matrix:

    uv run .github/scripts/select_behavioral.py --all
    uv run .github/scripts/select_behavioral.py --names "local-ai-use,rocm-doctor"
    git diff --name-only BASE HEAD | uv run .github/scripts/select_behavioral.py --changed

A skill is "testable" only when its name is well-formed and both its test file
and its skill folder (with a SKILL.md) exist; that keeps the matrix honest if a
test is added before its skill (or vice versa).
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent.parent
TESTS_DIR = REPO_ROOT / "eval" / "behavioral" / "tests"
SKILLS_DIR = REPO_ROOT / "skills"

TEST_PREFIX = "test_"
TEST_SUFFIX = ".py"

# Skill names are lowercase-with-hyphens. Names end up in filesystem paths and
# in a CI matrix, so anything else (path separators, "..", quotes, spaces, the
# empty string) is rejected up front instead of being joined into a path.
_SKILL_NAME_RE = re.compile(r"[a-z0-9]+(?:-[a-z0-9]+)*")

# Touching any of these means the shared harness (not one skill) changed, so we
# re-run every behavioral test rather than trying to guess the blast radius.
# Paths are repo-root-relative and use forward slashes to match `git diff`.
INFRA_FILES = {
    "eval/behavioral/harness.py",
    "eval/behavioral/conftest.py",
    "eval/behavioral/pytest.ini",
    "eval/behavioral/requirements.txt",
    "eval/claude_eval.py",
    ".github/scripts/select_behavioral.py",
    ".github/workflows/behavioral.yml",
}


def skill_to_test(skill: str) -> str:
    """Return the test filename for a skill.

    `local-ai-use` -> `test_local_ai_use.py`.
    """
    return f"{TEST_PREFIX}{skill.replace('-', '_')}{TEST_SUFFIX}"


def test_to_skill(filename: str) -> str:
    """Return the skill name for a test filename (inverse of skill_to_test).

    `test_local_ai_use.py` -> `local-ai-use`. The caller is expected to pass a
    bare filename that starts with TEST_PREFIX and ends with TEST_SUFFIX; the
    prefix and suffix are sliced off by length, not checked.
    """
    stem = filename[len(TEST_PREFIX) : -len(TEST_SUFFIX)]
    return stem.replace("_", "-")


# Despite its `test_` prefix this is a helper, not a test: opt out of pytest
# collection in case the module is ever star-imported into a test module.
test_to_skill.__test__ = False


def is_testable(skill: str) -> bool:
    """Return True when `skill` is a well-formed name with a test and a skill.

    Three things must hold: the name is lowercase-with-hyphens, the matching
    test file exists under TESTS_DIR, and skills/<skill>/SKILL.md exists.
    Malformed names return False without touching the filesystem.
    """
    if not _SKILL_NAME_RE.fullmatch(skill):
        return False
    has_test = (TESTS_DIR / skill_to_test(skill)).is_file()
    has_skill = (SKILLS_DIR / skill / "SKILL.md").is_file()
    return has_test and has_skill


def all_testable_skills() -> list[str]:
    """Return every skill that has both a behavioral test and a skill folder.

    The result is sorted and de-duplicated; it is empty when TESTS_DIR does
    not exist.
    """
    if not TESTS_DIR.is_dir():
        return []
    candidates = (
        test_to_skill(path.name)
        for path in TESTS_DIR.glob(f"{TEST_PREFIX}*{TEST_SUFFIX}")
    )
    return sorted({skill for skill in candidates if is_testable(skill)})


def select_from_changes(changed: list[str]) -> list[str]:
    """Map changed file paths to the testable skills they affect.

    `changed` holds repo-root-relative paths (as printed by
    `git diff --name-only`); blank entries are ignored and backslashes are
    normalized to forward slashes. If any path is in INFRA_FILES the whole
    suite is returned. Otherwise a path selects a skill when it lies under
    skills/<skill-name>/ or is that skill's test file. The result is sorted.
    """
    normalized = {p.strip().replace("\\", "/") for p in changed if p.strip()}

    # Shared-harness change: run the whole suite.
    if normalized & INFRA_FILES:
        return all_testable_skills()

    selected: set[str] = set()
    for path in normalized:
        candidate = None
        if path.startswith("skills/"):
            # A change inside skills/<skill-name>/...; the prefix guarantees
            # that a second path component exists (possibly empty).
            candidate = path.split("/")[1]
        elif path.startswith("eval/behavioral/tests/") and path.endswith(TEST_SUFFIX):
            # A change to a behavioral test file itself.
            name = Path(path).name
            if name.startswith(TEST_PREFIX):
                candidate = test_to_skill(name)
        if candidate is not None and is_testable(candidate):
            selected.add(candidate)
    return sorted(selected)


def select_from_names(names: str) -> list[str]:
    """Filter an explicit, comma-separated skill list down to testable ones.

    Whitespace around names and empty entries are ignored. Names that are not
    testable are dropped from the result and reported on stderr, so a typo in
    a manual run shows up in the log; stdout stays a clean JSON array.
    """
    requested = {n.strip() for n in names.split(",") if n.strip()}
    selected = {n for n in requested if is_testable(n)}
    ignored = sorted(requested - selected)
    if ignored:
        print(
            f"warning: ignoring untestable skill name(s): {', '.join(ignored)}",
            file=sys.stderr,
        )
    return sorted(selected)


def main(argv: list[str] | None = None) -> int:
    """Parse the mode flag, print the selected skills as JSON, and return 0.

    Exactly one of --all, --changed or --names is required. `argv` defaults
    to the process arguments when None.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage examples in the module docstring laid out as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--all",
        action="store_true",
        help="Print every skill that has a behavioral test.",
    )
    mode.add_argument(
        "--changed",
        action="store_true",
        help="Read changed file paths from stdin and print the affected skills.",
    )
    mode.add_argument(
        "--names",
        metavar="A,B,C",
        help="Print the testable subset of this comma-separated skill list.",
    )
    args = parser.parse_args(argv)

    if args.all:
        skills = all_testable_skills()
    elif args.names is not None:
        skills = select_from_names(args.names)
    else:
        skills = select_from_changes(sys.stdin.read().splitlines())

    print(json.dumps(skills))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
name: behavioral

# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models, so the actual test job is opt-in. The design:
#
#   * selective -- only the skills whose folder or test changed are run (the
#     whole suite runs when the shared harness changes). See
#     .github/scripts/select_behavioral.py.
#   * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
#     only runs on manual dispatch or when a maintainer adds the
#     `run_behavioral` label, keeping the secret away from untrusted / fork
#     code that runs with tool permissions bypassed.
#   * required when relevant -- when a PR changes a skill or test that maps to a
#     behavioral test, the `behavioral` gate FAILS until the label is added and
#     the tests pass. A PR that touches nothing testable passes neutrally.
#   * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check. `discover` is
# secret-free, so it runs on every matching PR to decide whether the label is
# required; only `behavioral` is gated on the label.
#
# Event data is never interpolated with `${{ }}` inside a `run:` script; it is
# passed through `env:` and read as a quoted shell variable, so free-form
# values (dispatch input, skill names) cannot inject shell commands.

on:
  pull_request:
    types: [opened, synchronize, reopened, labeled]
    paths:
      - "skills/**"
      - "eval/behavioral/**"
      - "eval/claude_eval.py"
      - ".github/workflows/behavioral.yml"
      - ".github/scripts/select_behavioral.py"
  workflow_dispatch:
    inputs:
      skills:
        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
        required: false
        default: ""

concurrency:
  group: behavioral-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

env:
  # Used in the gate's messages. The `if:` / `contains(...)` expressions below
  # cannot read `env`, so they repeat the literal label name; keep them in sync.
  BEHAVIORAL_LABEL: run_behavioral

jobs:
  # Decide which skills the change affects. This is secret-free (just git diff +
  # a Python mapping), so it runs on every matching PR regardless of the label;
  # the label only gates the test job below. Its `any` output drives whether the
  # label is required for this PR.
  discover:
    name: Select behavioral tests
    runs-on: ubuntu-latest
    outputs:
      skills: ${{ steps.select.outputs.skills }}
      any: ${{ steps.select.outputs.any }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Need the merge base so `git diff` can see what the PR changed.
          fetch-depth: 0

      - name: Set up uv
        uses: astral-sh/setup-uv@v7

      - name: Select skills
        id: select
        env:
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail
          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            if [ -n "$REQUESTED_SKILLS" ]; then
              skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED_SKILLS")
            else
              skills=$(uv run .github/scripts/select_behavioral.py --all)
            fi
          else
            # Three-dot diff: compare HEAD against its merge base with the base
            # branch, so commits that landed on the base branch after the PR
            # branched off are not mistaken for changes made by the PR.
            skills=$(git diff --name-only "$BASE_SHA...$HEAD_SHA" \
              | uv run .github/scripts/select_behavioral.py --changed)
          fi
          echo "Selected skills: $skills"
          echo "skills=$skills" >> "$GITHUB_OUTPUT"
          if [ "$skills" = "[]" ]; then
            echo "any=false" >> "$GITHUB_OUTPUT"
          else
            echo "any=true" >> "$GITHUB_OUTPUT"
          fi

  behavioral:
    name: Behavioral (${{ matrix.skill }})
    needs: discover
    # Run only when something testable changed AND the run is authorized:
    # manual dispatch, or a maintainer added the `run_behavioral` label. This is
    # the gate that protects the ANTHROPIC_API_KEY secret.
    if: >-
      needs.discover.outputs.any == 'true' &&
      (github.event_name == 'workflow_dispatch' ||
       contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
    runs-on: ubuntu-latest
    # Behavioral runs install local models and can take a while; cap it so a
    # hung agent or stalled model pull fails the job instead of burning minutes.
    timeout-minutes: 45
    strategy:
      # One skill failing should not hide the others' results.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.discover.outputs.skills) }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install the claude CLI
        run: npm install -g @anthropic-ai/claude-code

      - name: Install behavioral test dependencies
        run: pip install -r eval/behavioral/requirements.txt

      - name: Run behavioral test for ${{ matrix.skill }}
        working-directory: eval/behavioral
        env:
          # The CLI authenticates from this key. This job only runs on labeled
          # PRs and manual dispatch (see this job's `if:` above).
          # NOTE: GitHub does not pass repository secrets to `pull_request`
          # runs triggered from a fork, so on a fork PR this value is empty
          # even when the label is set.
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          # Lets the harness default to this skill if a test relies on the env;
          # also the source of the test filename below.
          BEHAVIORAL_SKILL: ${{ matrix.skill }}
        run: |
          set -euo pipefail
          # Skill name -> test file: hyphens become underscores.
          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
          echo "Running $test_file"
          pytest "$test_file"

  # Single aggregate gate. Mark THIS check required in branch protection.
  #
  #   * nothing testable changed        -> pass (neutral).
  #   * testable change, label missing  -> FAIL, asking for the label.
  #   * testable change, authorized     -> pass iff the behavioral job passed.
  behavioral-gate:
    name: behavioral
    needs: [discover, behavioral]
    if: always()
    runs-on: ubuntu-latest
    env:
      EVENT_NAME: ${{ github.event_name }}
      DISCOVER_RESULT: ${{ needs.discover.result }}
      BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
      AFFECTED: ${{ needs.discover.outputs.any }}
      SKILLS: ${{ needs.discover.outputs.skills }}
      # 'true' only on a PR that carries the label; '' / 'false' otherwise.
      LABEL_PRESENT: ${{ contains(github.event.pull_request.labels.*.name, 'run_behavioral') }}
    steps:
      - name: Verify behavioral results
        run: |
          echo "discover:   $DISCOVER_RESULT"
          echo "behavioral: $BEHAVIORAL_RESULT"
          echo "affected:   $AFFECTED ($SKILLS)"
          echo "label:      $LABEL_PRESENT"

          # If discovery itself failed, surface that rather than guessing.
          if [ "$DISCOVER_RESULT" != "success" ]; then
            echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
            exit 1
          fi

          # No skill or behavioral test changed: nothing to gate on.
          if [ "$AFFECTED" != "true" ]; then
            echo "No behavioral tests affected by this change."
            exit 0
          fi

          # Something testable changed. Manual dispatch and labeled PRs are
          # authorized to run the tests, so the gate reflects the test result.
          if [ "$EVENT_NAME" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
            if [ "$BEHAVIORAL_RESULT" = "success" ]; then
              echo "All affected behavioral tests passed."
              exit 0
            fi
            echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
            exit 1
          fi

          # Testable change on a PR with no label: require it.
          echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
          echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
          exit 1
## Behavioral tests

Structural validation proves a skill is *well-formed*; behavioral tests prove it
*works*. A behavioral test runs a real agent against the skill once and grades
what the agent did — see the harness in [`eval/behavioral/`](eval/behavioral/).

Conventions:

- **One file per skill, centralized.** Put the test at
  `eval/behavioral/tests/test_<skill_name>.py`, swapping the skill name's
  hyphens for underscores (`local-ai-use` → `test_local_ai_use.py`). Tests live
  here, not inside `skills/<skill-name>/`, because the harness copies the skill
  folder into the agent's sandbox at runtime — test files in there would
  pollute the workspace.
- **Write checks against behavior.** Combine deterministic assertions
  (`logs_contains`, `workspace_contains`) with LLM-judged expectations
  (`should`, `should_not`). See `test_local_ai_use.py` for the pattern.

Run one locally (needs the `claude` CLI authenticated and any per-skill
prerequisites, e.g. a reachable Lemonade Server for `local-ai-use`):

```bash
pip install -r eval/behavioral/requirements.txt
cd eval/behavioral && pytest tests/test_local_ai_use.py
```

In CI, the `behavioral` workflow runs these tests **only** when a maintainer
adds the `run_behavioral` label to the PR or dispatches the workflow manually;
the label is a safety gate that keeps the API key away from unreviewed changes.
A PR that changes a skill or its behavioral test fails the `behavioral` check
until the label is added and the tests pass.

## Pre-publish checklist

- [ ] Description states the user's goal and includes likely trigger phrases