# Workflow file for this run: "Point Behavioral tests to self-hosted runners" #30

name: behavioral
# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models. The design:
#
#   * selective -- only the skills whose folder or test changed are run (the
#     whole suite runs when the shared harness changes). See
#     .github/scripts/select_behavioral.py.
#   * required when relevant -- when a PR changes a skill or test that maps to
#     a behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
#     that touches nothing testable passes neutrally.
#   * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check.
# Triggers: every pull request update, plus manual runs from the Actions tab.
on:
  # Deliberately no `paths:` filter. When a workflow is skipped by path
  # filtering its checks stay "Pending", so a required `behavioral` check would
  # block every PR that touches none of the filtered paths. Relevance is
  # decided inside the run instead: the `discover` job maps the changed files
  # to skills and the gate passes when nothing testable is affected.
  pull_request:
    types: [opened, synchronize, reopened]
  workflow_dispatch:
    inputs:
      skills:
        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
        required: false
        default: ""
# One concurrency group per pull request (or per ref when there is no PR, e.g.
# a manual dispatch); a newer run cancels the one still in progress.
concurrency:
  group: behavioral-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # Decide which skills the change affects. This is secret-free (just git diff +
  # a Python mapping). Its `any` output drives whether the behavioral job runs
  # and whether the gate has anything to enforce for this PR.
  discover:
    name: Select behavioral tests
    runs-on: ubuntu-latest
    outputs:
      # Skill list as printed by select_behavioral.py; the behavioral job feeds
      # it to fromJson(), so it is expected to be a JSON array.
      skills: ${{ steps.select.outputs.skills }}
      # 'true' unless the selected list is exactly `[]`.
      any: ${{ steps.select.outputs.any }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Need the merge base so `git diff` can see what the PR changed.
          fetch-depth: 0
      - name: Set up uv
        uses: astral-sh/setup-uv@v7
      - name: Select skills
        id: select
        # Event data is handed to the script through environment variables
        # rather than expanded into the script text, so a crafted input value
        # is treated as data and cannot change the commands that run.
        env:
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail
          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            # Manual run: an explicit list of names, or everything when blank.
            if [ -n "$REQUESTED_SKILLS" ]; then
              skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED_SKILLS")
            else
              skills=$(uv run .github/scripts/select_behavioral.py --all)
            fi
          else
            # Three-dot diff: compare the PR head against its merge base with
            # the base branch, so files that only changed on the base branch
            # after the PR forked do not select (and pay for) unrelated tests.
            skills=$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" \
              | uv run .github/scripts/select_behavioral.py --changed)
          fi
          echo "Selected skills: $skills"
          echo "skills=$skills" >> "$GITHUB_OUTPUT"
          # An empty JSON array means there is nothing to run or to gate on.
          if [ "$skills" = "[]" ]; then
            echo "any=false" >> "$GITHUB_OUTPUT"
          else
            echo "any=true" >> "$GITHUB_OUTPUT"
          fi

  # Run the behavioral test of every selected skill on each OS in the matrix.
  behavioral:
    name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
    needs: discover
    # Run whenever the change affects something testable.
    if: needs.discover.outputs.any == 'true'
    # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
    # the matrix below so each skill is exercised on both platforms.
    runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
    # Behavioral runs install local models and can take a while; cap it so a
    # hung agent or stalled model pull fails the job instead of burning minutes.
    timeout-minutes: 45
    strategy:
      # One skill / OS failing should not hide the others' results.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.discover.outputs.skills) }}
        os: [Linux, Windows]
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
      ANTHROPIC_BASE_URL: https://llm-api.amd.com/Anthropic
      # One "Name: Value" header per line.
      # NOTE(review): `user` is assumed to be a second header line of this
      # block, not a separate env var -- confirm against the original file.
      ANTHROPIC_CUSTOM_HEADERS: |
        Ocp-Apim-Subscription-Key: ${{ secrets.ORCHESTR_API_KEY }}
        user: a1_ucicd
      # Lets the harness default to this skill if a test relies on the env.
      # The run steps below also read the skill name from this variable.
      BEHAVIORAL_SKILL: ${{ matrix.skill }}
      # Cost cap: sonnet only. The harness also enforces this under CI.
      BEHAVIORAL_MODEL: sonnet
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install the claude CLI
        run: npm install -g @anthropic-ai/claude-code
      - name: Install behavioral test dependencies
        run: pip install -r eval/behavioral/requirements.txt
      - name: Run behavioral test for ${{ matrix.skill }} (Linux)
        if: matrix.os == 'Linux'
        working-directory: eval/behavioral
        shell: bash
        run: |
          set -euo pipefail
          # The skill name comes from the job-level env var instead of being
          # expanded into the script; dashes map to underscores to form the
          # test module name.
          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
          echo "Running $test_file"
          pytest "$test_file"
      - name: Run behavioral test for ${{ matrix.skill }} (Windows)
        if: matrix.os == 'Windows'
        working-directory: eval/behavioral
        shell: powershell
        run: |
          $ErrorActionPreference = "Stop"
          # Same mapping as the Linux step, reading the skill name from the
          # job-level env var.
          $skill = $env:BEHAVIORAL_SKILL
          $test_file = "tests/test_$($skill -replace '-','_').py"
          Write-Host "Running $test_file"
          pytest $test_file

  # Single aggregate gate. Mark THIS check required in branch protection.
  #
  #   * nothing testable changed -> pass (neutral).
  #   * testable change          -> pass iff the behavioral job passed.
  behavioral-gate:
    name: behavioral
    needs: [discover, behavioral]
    # Always run, so the gate reports even when an upstream job failed or was
    # skipped.
    if: always()
    runs-on: ubuntu-latest
    env:
      DISCOVER_RESULT: ${{ needs.discover.result }}
      BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
      AFFECTED: ${{ needs.discover.outputs.any }}
      SKILLS: ${{ needs.discover.outputs.skills }}
    steps:
      - name: Verify behavioral results
        run: |
          echo "discover: $DISCOVER_RESULT"
          echo "behavioral: $BEHAVIORAL_RESULT"
          echo "affected: $AFFECTED ($SKILLS)"
          # If discovery itself failed, surface that rather than guessing.
          if [ "$DISCOVER_RESULT" != "success" ]; then
            echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
            exit 1
          fi
          # No skill or behavioral test changed: nothing to gate on.
          if [ "$AFFECTED" != "true" ]; then
            echo "No behavioral tests affected by this change."
            exit 0
          fi
          # Something testable changed: the gate reflects the test result.
          if [ "$BEHAVIORAL_RESULT" = "success" ]; then
            echo "All affected behavioral tests passed."
            exit 0
          fi
          echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
          exit 1