Skip to content

Commit c662296

Browse files
committed
Add behavioral tests to github workflows
1 parent 188d534 commit c662296

3 files changed

Lines changed: 353 additions & 0 deletions

File tree

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#!/usr/bin/env -S uv run --quiet
2+
# /// script
3+
# requires-python = ">=3.10"
4+
# dependencies = []
5+
# ///
6+
"""Select which behavioral tests to run, by skill name.
7+
8+
Behavioral tests live one-per-skill (see CONTRIBUTING.md) at:
9+
10+
eval/behavioral/tests/test_<skill_with_underscores>.py
11+
12+
and exercise the matching skill under skills/<skill>/. Skill names are
13+
lowercase-with-hyphens; the test filename swaps the hyphens for underscores
14+
(``local-ai-use`` -> ``test_local_ai_use.py``) because that is what Python
15+
import / pytest collection require.
16+
17+
This script maps a set of changed files (read from stdin, one path per line)
18+
to the skills whose behavioral test should run, and is also used to enumerate
19+
every testable skill for manual / full runs.
20+
21+
Output is always a JSON array of skill names on stdout, suitable for a GitHub
22+
Actions matrix:
23+
24+
uv run .github/scripts/select_behavioral.py --all
25+
uv run .github/scripts/select_behavioral.py --names "local-ai-use,rocm-doctor"
26+
git diff --name-only BASE HEAD | uv run .github/scripts/select_behavioral.py --changed
27+
28+
A skill is "testable" only when both its test file and its skill folder exist;
29+
that keeps the matrix honest if a test is added before its skill (or vice
30+
versa).
31+
"""
32+
33+
from __future__ import annotations
34+
35+
import argparse
36+
import json
37+
import sys
38+
from pathlib import Path
39+
40+
# This script lives in .github/scripts/, so the repository root is three
# `parent` hops up from the resolved script path.
_SCRIPT_PATH = Path(__file__).resolve()
REPO_ROOT = _SCRIPT_PATH.parent.parent.parent
SKILLS_DIR = REPO_ROOT / "skills"
TESTS_DIR = REPO_ROOT / "eval" / "behavioral" / "tests"

# Behavioral test files are named `<TEST_PREFIX><skill_with_underscores><TEST_SUFFIX>`.
TEST_PREFIX = "test_"
TEST_SUFFIX = ".py"

# Shared harness / CI plumbing. A change to any of these is not attributable
# to a single skill, so the selector falls back to running every behavioral
# test. Entries are repo-root-relative with forward slashes, which is the
# form `git diff --name-only` prints.
INFRA_FILES = {
    ".github/scripts/select_behavioral.py",
    ".github/workflows/behavioral.yml",
    "eval/claude_eval.py",
    "eval/behavioral/conftest.py",
    "eval/behavioral/harness.py",
    "eval/behavioral/pytest.ini",
    "eval/behavioral/requirements.txt",
}
59+
60+
61+
def skill_to_test(skill: str) -> str:
    """Return the behavioral test filename for a skill name.

    Hyphens in the skill name become underscores and the result is wrapped in
    TEST_PREFIX / TEST_SUFFIX, e.g. `local-ai-use` -> `test_local_ai_use.py`.
    """
    module_stem = "_".join(skill.split("-"))
    return TEST_PREFIX + module_stem + TEST_SUFFIX
64+
65+
66+
def test_to_skill(filename: str) -> str:
    """Return the skill name for a behavioral test filename.

    Inverse of skill_to_test: `test_local_ai_use.py` -> `local-ai-use`.
    The prefix and suffix are removed by length, not by content, so callers
    are expected to pass a name that really has both.
    """
    prefix_len = len(TEST_PREFIX)
    suffix_len = len(TEST_SUFFIX)
    module_stem = filename[prefix_len:-suffix_len]
    return "-".join(module_stem.split("_"))
70+
71+
72+
def is_testable(skill: str) -> bool:
    """Report whether a skill can be run as a behavioral test.

    True only when both files exist: the test file under TESTS_DIR (named via
    skill_to_test) and `SKILL.md` inside the skill's folder under SKILLS_DIR.
    """
    test_path = TESTS_DIR / skill_to_test(skill)
    manifest_path = SKILLS_DIR / skill / "SKILL.md"
    checks = [test_path.is_file(), manifest_path.is_file()]
    return all(checks)
77+
78+
79+
def all_testable_skills() -> list[str]:
    """List every testable skill, sorted by name.

    Candidates come from the test files found in TESTS_DIR; each is kept only
    if is_testable agrees. Returns an empty list when TESTS_DIR is missing.
    """
    if not TESTS_DIR.is_dir():
        return []
    pattern = f"{TEST_PREFIX}*{TEST_SUFFIX}"
    candidates = {test_to_skill(entry.name) for entry in TESTS_DIR.glob(pattern)}
    return sorted(name for name in candidates if is_testable(name))
89+
90+
91+
def select_from_changes(changed: list[str]) -> list[str]:
    """Translate a list of changed paths into the testable skills to run.

    Blank entries are ignored and backslashes are normalized to forward
    slashes. If any path is listed in INFRA_FILES the whole suite is returned;
    otherwise a skill is selected when a path lies under `skills/<name>/` or
    is that skill's test file under `eval/behavioral/tests/`. The result is
    sorted and contains only skills accepted by is_testable.
    """
    paths = set()
    for raw in changed:
        cleaned = raw.strip()
        if cleaned:
            paths.add(cleaned.replace("\\", "/"))

    # Any shared-harness file in the change set means "run everything".
    if not INFRA_FILES.isdisjoint(paths):
        return all_testable_skills()

    candidates = set()
    for path in paths:
        segments = path.split("/")
        # skills/<name>/... -> <name>
        if segments[0] == "skills" and len(segments) >= 2:
            candidates.add(segments[1])
        # eval/behavioral/tests/.../test_<name>.py -> <name>
        is_under_tests = path.startswith("eval/behavioral/tests/")
        if is_under_tests and path.endswith(TEST_SUFFIX):
            filename = Path(path).name
            if filename.startswith(TEST_PREFIX):
                candidates.add(test_to_skill(filename))

    return sorted(name for name in candidates if is_testable(name))
114+
115+
116+
def select_from_names(names: str) -> list[str]:
    """Filter an explicit, comma-separated skill list down to testable ones.

    Args:
        names: Skill names separated by commas; surrounding whitespace and
            empty entries are ignored, duplicates are collapsed.

    Returns:
        The sorted, de-duplicated subset of the requested names for which
        is_testable is true.

    Names that are not testable (typo, missing test file, missing SKILL.md)
    are reported on stderr instead of being dropped silently. stdout is left
    untouched so callers that capture the JSON output are unaffected.
    """
    requested = {part.strip() for part in names.split(",")} - {""}
    testable = {name for name in requested if is_testable(name)}

    skipped = sorted(requested - testable)
    if skipped:
        print(
            "warning: ignoring skills with no behavioral test or no SKILL.md: "
            + ", ".join(skipped),
            file=sys.stderr,
        )
    return sorted(testable)
120+
121+
122+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: print the selected skills as a JSON array.

    Exactly one of --all, --changed or --names must be given. With --changed
    the changed paths are read from stdin, one per line.

    Args:
        argv: Argument list to parse; None lets argparse use the process
            arguments.

    Returns:
        0 after the JSON array has been printed to stdout.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    selector = parser.add_mutually_exclusive_group(required=True)
    selector.add_argument(
        "--all",
        action="store_true",
        help="Print every skill that has a behavioral test.",
    )
    selector.add_argument(
        "--changed",
        action="store_true",
        help="Read changed file paths from stdin and print the affected skills.",
    )
    selector.add_argument(
        "--names",
        metavar="A,B,C",
        help="Print the testable subset of this comma-separated skill list.",
    )
    options = parser.parse_args(argv)

    if options.all:
        chosen = all_testable_skills()
    elif options.names is None:
        # The group is required, so reaching here means --changed was given.
        changed_paths = sys.stdin.read().splitlines()
        chosen = select_from_changes(changed_paths)
    else:
        chosen = select_from_names(options.names)

    print(json.dumps(chosen))
    return 0
151+
152+
153+
# Run as a script: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())

.github/workflows/behavioral.yml

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
name: behavioral

# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models, so they are NOT part of the always-on PR gate.
# Instead they are:
#
#   * label-gated on PRs -- a maintainer adds the `run-behavioral` label to opt
#     a PR in. This keeps the ANTHROPIC_API_KEY secret away from untrusted /
#     fork code, which runs with tool permissions bypassed.
#   * selective -- only the skills whose folder or test changed are run (the
#     whole suite runs when the shared harness changes). See
#     .github/scripts/select_behavioral.py.
#   * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check. That gate passes
# (neutral) when the label is absent, so it never blocks an unlabeled PR.

on:
  pull_request:
    types: [opened, synchronize, reopened, labeled]
    # Deliberately no `paths:` filter. A workflow that is skipped by path
    # filtering leaves its checks "Pending", so a *required* `behavioral`
    # check would block every PR that touches none of the listed paths.
    # Narrowing to the affected skills is done by select_behavioral.py
    # instead; an unlabeled or unrelated PR only runs the cheap gate job.
  workflow_dispatch:
    inputs:
      skills:
        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
        required: false
        default: ""

concurrency:
  group: behavioral-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

env:
  BEHAVIORAL_LABEL: run-behavioral
45+
46+
jobs:
  # Decide which skills to run. On PRs this is gated on the `run-behavioral`
  # label so the secret is never exposed to code that a maintainer hasn't
  # vouched for.
  discover:
    name: Select behavioral tests
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request' &&
      contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
    runs-on: ubuntu-latest
    outputs:
      skills: ${{ steps.select.outputs.skills }}
      any: ${{ steps.select.outputs.any }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Full history, so the three-dot diff below can find the merge base.
          fetch-depth: 0

      - name: Set up uv
        uses: astral-sh/setup-uv@v7

      - name: Select skills
        id: select
        # Event data is handed to the script through the environment rather
        # than expanded into the script text, so a crafted input cannot be
        # executed as shell code.
        env:
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail
          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            if [ -n "${REQUESTED_SKILLS:-}" ]; then
              skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED_SKILLS")
            else
              skills=$(uv run .github/scripts/select_behavioral.py --all)
            fi
          else
            # `base...head` diffs from the merge base, i.e. only what the PR
            # itself changed. A plain `base head` diff would also list files
            # that moved on the base branch after the PR branched off.
            skills=$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" \
              | uv run .github/scripts/select_behavioral.py --changed)
          fi
          echo "Selected skills: $skills"
          echo "skills=$skills" >> "$GITHUB_OUTPUT"
          if [ "$skills" = "[]" ]; then
            echo "any=false" >> "$GITHUB_OUTPUT"
          else
            echo "any=true" >> "$GITHUB_OUTPUT"
          fi

  behavioral:
    name: Behavioral (${{ matrix.skill }})
    needs: discover
    if: needs.discover.outputs.any == 'true'
    runs-on: ubuntu-latest
    # Behavioral runs install local models and can take a while; cap it so a
    # hung agent or stalled model pull fails the job instead of burning minutes.
    timeout-minutes: 45
    strategy:
      # One skill failing should not hide the others' results.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.discover.outputs.skills) }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install the claude CLI
        run: npm install -g @anthropic-ai/claude-code

      - name: Install behavioral test dependencies
        run: pip install -r eval/behavioral/requirements.txt

      - name: Run behavioral test for ${{ matrix.skill }}
        working-directory: eval/behavioral
        env:
          # The CLI authenticates from this key; it is only present on labeled
          # PRs and manual dispatch (see the `discover` gate above).
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          # Lets the harness default to this skill if a test relies on the env.
          # Also the only way the skill name reaches the script below: it is
          # derived from file names in the PR, so it is never expanded into
          # the script text.
          BEHAVIORAL_SKILL: ${{ matrix.skill }}
        run: |
          set -euo pipefail
          # Skill names use hyphens, test files use underscores.
          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
          echo "Running $test_file"
          pytest "$test_file"

  # Single aggregate gate. Mark THIS check required in branch protection.
  # It passes when behavioral tests were not requested (no label) and when
  # every selected behavioral job succeeded, so it never blocks an unlabeled PR.
  behavioral-gate:
    name: behavioral
    needs: [discover, behavioral]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Verify behavioral results
        run: |
          discover_result="${{ needs.discover.result }}"
          behavioral_result="${{ needs.behavioral.result }}"
          echo "discover:   $discover_result"
          echo "behavioral: $behavioral_result"

          # Label absent (or dispatch skipped): behavioral tests were not
          # requested for this run, so the gate is a no-op pass.
          if [ "$discover_result" = "skipped" ]; then
            echo "Behavioral tests not requested (no '${BEHAVIORAL_LABEL}' label)."
            exit 0
          fi

          # Selection itself failed or was cancelled. The matrix job is then
          # reported as "skipped", which must not be mistaken for "nothing
          # matched the change set".
          if [ "$discover_result" != "success" ]; then
            echo "Selecting behavioral tests did not succeed ($discover_result)." >&2
            exit 1
          fi

          # Nothing matched the change set, or everything that ran passed.
          if [ "$behavioral_result" = "success" ] || [ "$behavioral_result" = "skipped" ]; then
            echo "All requested behavioral tests passed."
            exit 0
          fi

          echo "One or more behavioral tests failed." >&2
          exit 1

CONTRIBUTING.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,34 @@ Test the skill the way users will hit it:
196196
2. Run the skill end-to-end on a real machine. Watch where the agent hesitates, asks unnecessary questions, or goes off-script.
197197
3. Bring those observations back into the skill, usually as a sharper description, a clearer default, or a missing prerequisite, rather than adding more prose.
198198

199+
## Behavioral tests
200+
201+
Structural validation proves a skill is *well-formed*; behavioral tests prove it
202+
*works*. A behavioral test runs a real agent against the skill once and grades
203+
what the agent did — see the harness in [`eval/behavioral/`](eval/behavioral/).
204+
205+
Conventions:
206+
207+
- **One file per skill, centralized.** Put the test at
208+
`eval/behavioral/tests/test_<skill>.py`, swapping the skill name's hyphens for
209+
underscores (`local-ai-use` → `test_local_ai_use.py`). Tests live here, not
210+
inside `skills/<name>/`, because the harness copies the skill folder into the
211+
agent's sandbox at runtime — test files in there would pollute the workspace.
212+
- **Write checks against behavior.** Combine deterministic assertions
213+
(`logs_contains`, `workspace_contains`) with LLM-judged expectations
214+
(`should`, `should_not`). See `test_local_ai_use.py` for the pattern.
215+
216+
Run one locally (needs the `claude` CLI authenticated and any per-skill
217+
prerequisites, e.g. a reachable Lemonade Server for `local-ai-use`):
218+
219+
```bash
220+
pip install -r eval/behavioral/requirements.txt
221+
cd eval/behavioral && pytest tests/test_local_ai_use.py
222+
```
223+
224+
In CI, the `behavioral` workflow runs these tests, but **only** when a
225+
maintainer adds the `run-behavioral` label to a PR, so the API key is never exposed to unreviewed code.
226+
199227
## Pre-publish checklist
200228

201229
- [ ] Description states the user's goal and includes likely trigger phrases

0 commit comments

Comments
 (0)