# Add behavioral tests to github workflows #1
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: behavioral

# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models, so they are NOT part of the always-on PR gate.
# Instead they are:
#
#   * label-gated on PRs -- a maintainer adds the `run-behavioral` label to opt
#     a PR in. This keeps the ANTHROPIC_API_KEY secret away from untrusted /
#     fork code, which runs with tool permissions bypassed.
#   * selective -- only the skills whose folder or test changed are run (the
#     whole suite runs when the shared harness changes). See
#     .github/scripts/select_behavioral.py.
#   * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check. That gate passes
# (neutral) when the label is absent, so it never blocks an unlabeled PR.

on:
  pull_request:
    # `labeled` is included so that adding the opt-in label starts a run.
    #
    # Deliberately no `paths:` filter here. A workflow that is skipped by a
    # path filter never reports its checks, so a required `behavioral` check
    # would stay pending forever on PRs that touch nothing under the filter.
    # The label gate and select_behavioral.py already limit what actually runs;
    # on an unlabeled PR only the aggregate gate job executes.
    types: [opened, synchronize, reopened, labeled]
  workflow_dispatch:
    inputs:
      skills:
        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
        required: false
        default: ""

# One live run per PR (or per ref for manual dispatch); a newer event for the
# same PR cancels the run that is still in progress.
concurrency:
  group: behavioral-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

env:
  # Only used for log messages. Job-level `if:` cannot read the `env` context,
  # so the label name is repeated literally in the `discover` condition below;
  # keep the two in sync.
  BEHAVIORAL_LABEL: run-behavioral

jobs:
  # Decide which skills to run. On PRs this is gated on the `run-behavioral`
  # label so the secret is never exposed to code that a maintainer hasn't
  # vouched for.
  discover:
    name: Select behavioral tests
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request' &&
      contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
    runs-on: ubuntu-latest
    outputs:
      # JSON array of skill names, consumed by the matrix via fromJson().
      skills: ${{ steps.select.outputs.skills }}
      # "true" when at least one skill was selected, "false" for "[]".
      any: ${{ steps.select.outputs.any }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Need the merge base so `git diff` can see what the PR changed.
          fetch-depth: 0

      - name: Set up uv
        uses: astral-sh/setup-uv@v7

      - name: Select skills
        id: select
        env:
          # Event data is handed to the script through the environment rather
          # than spliced into the script text with ${{ }}, so a crafted input
          # value cannot be interpreted as shell syntax.
          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail
          selector=.github/scripts/select_behavioral.py

          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
            if [ -n "$REQUESTED_SKILLS" ]; then
              skills=$(uv run "$selector" --names "$REQUESTED_SKILLS")
            else
              skills=$(uv run "$selector" --all)
            fi
          else
            # Three-dot range: diff from the merge base to the PR head, i.e.
            # only what the PR itself changed. A two-dot diff would also list
            # files that moved on the base branch after the PR forked.
            skills=$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" \
              | uv run "$selector" --changed)
          fi

          echo "Selected skills: $skills"
          echo "skills=$skills" >> "$GITHUB_OUTPUT"
          if [ "$skills" = "[]" ]; then
            echo "any=false" >> "$GITHUB_OUTPUT"
          else
            echo "any=true" >> "$GITHUB_OUTPUT"
          fi

  # One matrix job per selected skill. Skipped entirely when `discover` was
  # skipped, failed, or selected nothing.
  behavioral:
    name: Behavioral (${{ matrix.skill }})
    needs: discover
    if: needs.discover.outputs.any == 'true'
    runs-on: ubuntu-latest
    # Behavioral runs install local models and can take a while; cap it so a
    # hung agent or stalled model pull fails the job instead of burning minutes.
    timeout-minutes: 45
    strategy:
      # One skill failing should not hide the others' results.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.discover.outputs.skills) }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install the claude CLI
        run: npm install -g @anthropic-ai/claude-code

      - name: Install behavioral test dependencies
        run: pip install -r eval/behavioral/requirements.txt

      - name: Run behavioral test for ${{ matrix.skill }}
        working-directory: eval/behavioral
        env:
          # The CLI authenticates from this key; it is only present on labeled
          # PRs and manual dispatch (see the `discover` gate above).
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          # Lets the harness default to this skill if a test relies on the env.
          # Also the source of the test file name below, so the skill name is
          # never spliced into the script text.
          BEHAVIORAL_SKILL: ${{ matrix.skill }}
        run: |
          set -euo pipefail
          # Skill "foo-bar" is tested by tests/test_foo_bar.py.
          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
          echo "Running $test_file"
          pytest "$test_file"

  # Single aggregate gate. Mark THIS check required in branch protection.
  # It passes when behavioral tests were not requested (no label) and when
  # every selected behavioral job succeeded, so it never blocks an unlabeled PR.
  behavioral-gate:
    name: behavioral
    needs: [discover, behavioral]
    # Run even when the jobs above were skipped or failed, so the check always
    # reports a result.
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Verify behavioral results
        env:
          DISCOVER_RESULT: ${{ needs.discover.result }}
          BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
        run: |
          set -euo pipefail
          echo "discover: $DISCOVER_RESULT"
          echo "behavioral: $BEHAVIORAL_RESULT"

          # Label absent (or dispatch skipped): behavioral tests were not
          # requested for this run, so the gate is a no-op pass.
          if [ "$DISCOVER_RESULT" = "skipped" ]; then
            echo "Behavioral tests not requested (no '${BEHAVIORAL_LABEL}' label)."
            exit 0
          fi

          # Selection itself failed or was cancelled. The matrix is skipped in
          # that case too, so this must be checked before treating a skipped
          # matrix as "nothing to run".
          if [ "$DISCOVER_RESULT" != "success" ]; then
            echo "Skill selection did not succeed (result: $DISCOVER_RESULT)." >&2
            exit 1
          fi

          # Nothing matched the change set, or everything that ran passed.
          if [ "$BEHAVIORAL_RESULT" = "success" ] || [ "$BEHAVIORAL_RESULT" = "skipped" ]; then
            echo "All requested behavioral tests passed."
            exit 0
          fi

          echo "One or more behavioral tests failed." >&2
          exit 1