Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions .github/scripts/select_behavioral.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env -S uv run --quiet
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""Select which behavioral tests to run, by skill name.

Behavioral tests live one-per-skill (see CONTRIBUTING.md) at:

eval/behavioral/tests/test_<skill_with_underscores>.py

and exercise the matching skill under skills/<skill>/. Skill names are
lowercase-with-hyphens; the test filename swaps the hyphens for underscores
(``local-ai-use`` -> ``test_local_ai_use.py``) because that is what Python
import / pytest collection require.

This script maps a set of changed files (read from stdin, one path per line)
to the skills whose behavioral test should run, and is also used to enumerate
every testable skill for manual / full runs.

Output is always a JSON array of skill names on stdout, suitable for a GitHub
Actions matrix:

uv run .github/scripts/select_behavioral.py --all
uv run .github/scripts/select_behavioral.py --names "local-ai-use,rocm-doctor"
git diff --name-only BASE HEAD | uv run .github/scripts/select_behavioral.py --changed

A skill is "testable" only when both its test file and its
skills/<skill>/SKILL.md exist; that keeps the matrix honest if a test is added
before its skill (or vice versa).
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# This script lives at <repo>/.github/scripts/, so the repository root is three
# directory levels above the resolved script path.
_SCRIPT_PATH = Path(__file__).resolve()
REPO_ROOT = _SCRIPT_PATH.parent.parent.parent
TESTS_DIR = REPO_ROOT.joinpath("eval", "behavioral", "tests")
SKILLS_DIR = REPO_ROOT.joinpath("skills")

# A behavioral test file is named TEST_PREFIX + <skill_with_underscores> + TEST_SUFFIX.
TEST_PREFIX = "test_"
TEST_SUFFIX = ".py"

# Shared-harness files. A change to any of them is not attributable to a single
# skill, so the whole behavioral suite is re-run instead of guessing the blast
# radius. Entries are repo-root-relative with forward slashes, which is the
# form `git diff --name-only` prints.
INFRA_FILES = {
    "eval/behavioral/harness.py",
    "eval/behavioral/conftest.py",
    "eval/behavioral/pytest.ini",
    "eval/behavioral/requirements.txt",
    "eval/claude_eval.py",
} | {
    ".github/scripts/select_behavioral.py",
    ".github/workflows/behavioral.yml",
}


def skill_to_test(skill: str) -> str:
    """Return the behavioral test filename for *skill*.

    Hyphens in the skill name become underscores, e.g.
    ``local-ai-use`` -> ``test_local_ai_use.py``.
    """
    module_stem = "_".join(skill.split("-"))
    return TEST_PREFIX + module_stem + TEST_SUFFIX


def test_to_skill(filename: str) -> str:
    """Return the skill name encoded in a behavioral test *filename*.

    Inverse of ``skill_to_test``: ``test_local_ai_use.py`` -> ``local-ai-use``.
    The prefix and suffix are removed by length only, so the caller is expected
    to pass a name that really starts with TEST_PREFIX and ends with TEST_SUFFIX.
    """
    prefix_len = len(TEST_PREFIX)
    suffix_len = len(TEST_SUFFIX)
    module_stem = filename[prefix_len:-suffix_len]
    return "-".join(module_stem.split("_"))


def is_testable(skill: str) -> bool:
    """Report whether *skill* has both a behavioral test and a SKILL.md.

    The test must exist as ``TESTS_DIR/test_<skill_with_underscores>.py`` and
    the skill as ``SKILLS_DIR/<skill>/SKILL.md``; both must be regular files.
    """
    test_file = TESTS_DIR / skill_to_test(skill)
    test_present = test_file.is_file()
    manifest = SKILLS_DIR / skill / "SKILL.md"
    manifest_present = manifest.is_file()
    return all((test_present, manifest_present))


def all_testable_skills() -> list[str]:
    """Return the sorted names of every skill that passes ``is_testable``.

    Candidates are discovered from the test files directly inside TESTS_DIR;
    a missing TESTS_DIR yields an empty list.
    """
    if not TESTS_DIR.is_dir():
        return []
    pattern = f"{TEST_PREFIX}*{TEST_SUFFIX}"
    candidates = (test_to_skill(entry.name) for entry in TESTS_DIR.glob(pattern))
    # A set drops duplicates before sorting into a stable, matrix-friendly order.
    return sorted({name for name in candidates if is_testable(name)})


def select_from_changes(changed: list[str]) -> list[str]:
    """Return the sorted testable skills affected by the *changed* paths.

    Paths are stripped, blank entries are ignored and backslashes are turned
    into forward slashes before matching. If any path is listed in INFRA_FILES
    the whole suite is returned. Otherwise a skill is selected when a path lies
    under ``skills/<skill>/`` or is that skill's behavioral test file, and the
    skill passes ``is_testable``.
    """
    touched = set()
    for raw in changed:
        cleaned = raw.strip()
        if cleaned:
            touched.add(cleaned.replace("\\", "/"))

    # Any shared-harness file in the change set means: run everything.
    if not touched.isdisjoint(INFRA_FILES):
        return all_testable_skills()

    tests_root = "eval/behavioral/tests/"
    candidates = set()
    for entry in touched:
        # skills/<name>/... -> <name>. The prefix check guarantees that the
        # split produces at least two segments.
        if entry.startswith("skills/"):
            candidates.add(entry.split("/")[1])
        # A behavioral test file -> the skill its filename encodes.
        if entry.startswith(tests_root) and entry.endswith(TEST_SUFFIX):
            basename = Path(entry).name
            if basename.startswith(TEST_PREFIX):
                candidates.add(test_to_skill(basename))

    # Filter once at the end: only skills with both a test and a SKILL.md count.
    return sorted(name for name in candidates if is_testable(name))


def select_from_names(names: str) -> list[str]:
    """Filter an explicit, comma-separated skill list down to testable ones.

    Args:
        names: Comma-separated skill names; surrounding whitespace and empty
            entries are ignored.

    Returns:
        The sorted, de-duplicated names that pass ``is_testable``.

    Names that are not testable (for example a typo in a manual dispatch) are
    still left out of the result, but they are reported on stderr so the
    omission shows up in the CI log. Stdout is left untouched because callers
    parse it as JSON.
    """
    requested = {part.strip() for part in names.split(",")} - {""}
    testable = sorted(name for name in requested if is_testable(name))
    ignored = sorted(requested.difference(testable))
    if ignored:
        print(
            "warning: ignoring skills without both a behavioral test and a "
            f"SKILL.md: {', '.join(ignored)}",
            file=sys.stderr,
        )
    return testable


def main(argv: list[str] | None = None) -> int:
    """Parse the command line, print the selected skills as JSON, return 0.

    Exactly one of ``--all``, ``--changed`` or ``--names`` must be given.
    ``--changed`` reads the changed paths from stdin, one per line.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    mode = parser.add_mutually_exclusive_group(required=True)

    # The two boolean switches differ only in flag name and help text.
    switches = (
        ("--all", "Print every skill that has a behavioral test."),
        (
            "--changed",
            "Read changed file paths from stdin and print the affected skills.",
        ),
    )
    for flag, description in switches:
        mode.add_argument(flag, action="store_true", help=description)
    mode.add_argument(
        "--names",
        metavar="A,B,C",
        help="Print the testable subset of this comma-separated skill list.",
    )
    args = parser.parse_args(argv)

    if args.all:
        skills = all_testable_skills()
    elif args.names is None:
        # Neither --all nor --names: the required group leaves only --changed.
        changed_paths = sys.stdin.read().splitlines()
        skills = select_from_changes(changed_paths)
    else:
        skills = select_from_names(args.names)

    print(json.dumps(skills))
    return 0


# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
199 changes: 199 additions & 0 deletions .github/workflows/behavioral.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
name: behavioral

# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models, so the actual test job is opt-in. The design:
#
# * selective -- only the skills whose folder or test changed are run (the
# whole suite runs when the shared harness changes). See
# .github/scripts/select_behavioral.py.
# * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
# only runs on manual dispatch or when a maintainer adds the
# `run_behavioral` label, keeping the secret away from untrusted / fork
# code that runs with tool permissions bypassed.
# * required when relevant -- when a PR changes a skill or test that maps to a
# behavioral test, the `behavioral` gate FAILS until the label is added and
# the tests pass. A PR that touches nothing testable passes neutrally.
# * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check. `discover` is
# secret-free, so it runs on every matching PR to decide whether the label is
# required; only `behavioral` is gated on the label.

on:
pull_request:
# `labeled` is included so that adding a label re-triggers the workflow; that
# is how adding the run_behavioral label starts the gated test job.
types: [opened, synchronize, reopened, labeled]
# Only pull requests touching skills, the behavioral harness, or this
# workflow's own files trigger a run.
paths:
- "skills/**"
- "eval/behavioral/**"
- "eval/claude_eval.py"
- ".github/workflows/behavioral.yml"
- ".github/scripts/select_behavioral.py"
workflow_dispatch:
inputs:
skills:
description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
required: false
default: ""

# One run per pull request (or per ref for manual dispatch); a newer run
# cancels the one still in progress.
concurrency:
group: behavioral-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

# Read-only repository access for the workflow token.
permissions:
contents: read

# Label name used in the gate's messages. NOTE: the `if:` on the behavioral job
# and LABEL_PRESENT in the gate spell out 'run_behavioral' literally, so keep
# them in sync with this value.
env:
BEHAVIORAL_LABEL: run_behavioral

jobs:
# Decide which skills the change affects. This is secret-free (just git diff +
# a Python mapping), so it runs on every matching PR regardless of the label;
# the label only gates the test job below. Its `any` output drives whether the
# label is required for this PR.
discover:
  name: Select behavioral tests
  runs-on: ubuntu-latest
  outputs:
    skills: ${{ steps.select.outputs.skills }}
    any: ${{ steps.select.outputs.any }}
  steps:
    - name: Check out repository
      uses: actions/checkout@v4
      with:
        # Need the merge base so `git diff` can see what the PR changed.
        fetch-depth: 0

    - name: Set up uv
      uses: astral-sh/setup-uv@v7

    - name: Select skills
      id: select
      # Event data is handed to the script through the environment instead of
      # being interpolated into the shell source, so a value such as the
      # free-text `skills` input can never be executed as shell code.
      env:
        EVENT_NAME: ${{ github.event_name }}
        REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
        BASE_SHA: ${{ github.event.pull_request.base.sha }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        set -euo pipefail
        if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
          if [ -n "${REQUESTED_SKILLS:-}" ]; then
            skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED_SKILLS")
          else
            skills=$(uv run .github/scripts/select_behavioral.py --all)
          fi
        else
          # Three-dot diff: compare the PR head against its merge base with
          # the base branch, so files changed only on the base branch since
          # the PR forked are not counted as PR changes.
          skills=$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" \
            | uv run .github/scripts/select_behavioral.py --changed)
        fi
        echo "Selected skills: $skills"
        echo "skills=$skills" >> "$GITHUB_OUTPUT"
        if [ "$skills" = "[]" ]; then
          echo "any=false" >> "$GITHUB_OUTPUT"
        else
          echo "any=true" >> "$GITHUB_OUTPUT"
        fi

behavioral:
  name: Behavioral (${{ matrix.skill }})
  needs: discover
  # Run only when something testable changed AND the run is authorized:
  # manual dispatch, or a maintainer added the `run_behavioral` label. This is
  # the gate that protects the ANTHROPIC_API_KEY secret.
  if: >-
    needs.discover.outputs.any == 'true' &&
    (github.event_name == 'workflow_dispatch' ||
    contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
  runs-on: ubuntu-latest
  # Behavioral runs install local models and can take a while; cap it so a
  # hung agent or stalled model pull fails the job instead of burning minutes.
  timeout-minutes: 45
  strategy:
    # One skill failing should not hide the others' results.
    fail-fast: false
    matrix:
      skill: ${{ fromJson(needs.discover.outputs.skills) }}
  steps:
    - name: Check out repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"

    - name: Set up Node
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install the claude CLI
      run: npm install -g @anthropic-ai/claude-code

    - name: Install behavioral test dependencies
      run: pip install -r eval/behavioral/requirements.txt

    - name: Run behavioral test for ${{ matrix.skill }}
      working-directory: eval/behavioral
      env:
        # The CLI authenticates from this key. This job only runs on labeled
        # PRs and manual dispatch (see this job's `if:` above).
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        # Lets the harness default to this skill if a test relies on the env.
        # The script below also reads the skill name from here rather than
        # from an inline expression, so the name is never parsed as shell
        # code in the step that holds the API key.
        BEHAVIORAL_SKILL: ${{ matrix.skill }}
      run: |
        set -euo pipefail
        # Skill names use hyphens; test module names use underscores.
        test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
        echo "Running $test_file"
        pytest "$test_file"

# Single aggregate gate. Mark THIS check required in branch protection.
#
# * nothing testable changed -> pass (neutral).
# * testable change, label missing -> FAIL, asking for the label.
# * testable change, authorized -> pass iff the behavioral job passed.
behavioral-gate:
name: behavioral
needs: [discover, behavioral]
# always(): the gate must still report a status when `behavioral` was skipped
# (no label, nothing affected) or failed; the script below inspects the
# upstream results itself.
if: always()
runs-on: ubuntu-latest
# Everything the script decides on is passed in as environment variables.
env:
DISCOVER_RESULT: ${{ needs.discover.result }}
BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
AFFECTED: ${{ needs.discover.outputs.any }}
SKILLS: ${{ needs.discover.outputs.skills }}
# 'true' only on a PR that carries the label; '' / 'false' otherwise.
LABEL_PRESENT: ${{ contains(github.event.pull_request.labels.*.name, 'run_behavioral') }}
steps:
# Decision order: discover failure -> nothing affected -> authorized run
# result -> missing label.
- name: Verify behavioral results
run: |
echo "discover: $DISCOVER_RESULT"
echo "behavioral: $BEHAVIORAL_RESULT"
echo "affected: $AFFECTED ($SKILLS)"
echo "label: $LABEL_PRESENT"

# If discovery itself failed, surface that rather than guessing.
if [ "$DISCOVER_RESULT" != "success" ]; then
echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
exit 1
fi

# No skill or behavioral test changed: nothing to gate on.
if [ "$AFFECTED" != "true" ]; then
echo "No behavioral tests affected by this change."
exit 0
fi

# Something testable changed. Manual dispatch and labeled PRs are
# authorized to run the tests, so the gate reflects the test result.
if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
if [ "$BEHAVIORAL_RESULT" = "success" ]; then
echo "All affected behavioral tests passed."
exit 0
fi
echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
exit 1
fi

# Testable change on a PR with no label: require it.
echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
exit 1
28 changes: 28 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,34 @@ Test the skill the way users will hit it:
2. Run the skill end-to-end on a real machine. Watch where the agent hesitates, asks unnecessary questions, or goes off-script.
3. Bring those observations back into the skill, usually as a sharper description, a clearer default, or a missing prerequisite, rather than adding more prose.

## Behavioral tests

Structural validation proves a skill is *well-formed*; behavioral tests prove it
*works*. A behavioral test runs a real agent against the skill once and grades
what the agent did — see the harness in [`eval/behavioral/`](eval/behavioral/).

Conventions:

- **One file per skill, centralized.** Put the test at
`eval/behavioral/tests/test_<skill>.py`, swapping the skill name's hyphens for
underscores (`local-ai-use` → `test_local_ai_use.py`). Tests live here, not
inside `skills/<name>/`, because the harness copies the skill folder into the
agent's sandbox at runtime — test files in there would pollute the workspace.
- **Write checks against behavior.** Combine deterministic assertions
(`logs_contains`, `workspace_contains`) with LLM-judged expectations
(`should`, `should_not`). See `test_local_ai_use.py` for the pattern.

Run one locally (needs the `claude` CLI authenticated and any per-skill
prerequisites, e.g. a reachable Lemonade Server for `local-ai-use`):

```bash
pip install -r eval/behavioral/requirements.txt
cd eval/behavioral && pytest tests/test_local_ai_use.py
```

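In CI, the `behavioral` workflow runs these tests **only** when a maintainer
adds the `run_behavioral` label to the PR or dispatches the workflow manually,
because the run uses a real API key. When a PR changes a skill or its
behavioral test, the `behavioral` check fails until the label is added and the
tests pass.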

## Pre-publish checklist

- [ ] Description states the user's goal and includes likely trigger phrases
Expand Down
Loading