Point Behavioral tests to self-hosted runners #30

Workflow file for this run

.github/workflows/behavioral.yml at 6bbce34

	name: behavioral

	# Behavioral tests run a real agent against a skill and grade what it did (see
	# eval/behavioral/). They cost real API tokens and, for some skills, install
	# and exercise local models. The design:
	#
	# * selective -- only the skills whose folder or test changed are run (the
	#   whole suite runs when the shared harness changes). See
	#   .github/scripts/select_behavioral.py.
	# * required when relevant -- when a PR changes a skill or test that maps to a
	#   behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
	#   that touches nothing testable passes neutrally.
	# * dispatchable -- run any subset by hand from the Actions tab.
	#
	# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
	# branch protection can require just the `behavioral` check.

	# Triggers: every pull request, plus manual runs from the Actions tab.
	on:
	  pull_request:
	    types: [opened, synchronize, reopened]
	    # Deliberately no `paths:` filter. The `behavioral` gate is meant to be a
	    # required check, and a workflow skipped by path filtering leaves its
	    # required checks pending forever, which would block unrelated PRs. The
	    # `discover` job already narrows the run to the affected skills and lets
	    # the gate pass when nothing testable changed.
	  workflow_dispatch:
	    inputs:
	      skills:
	        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
	        required: false
	        default: ""

	# One live run per pull request (or per ref when there is no PR, e.g. a manual
	# dispatch); starting a newer run in the same group cancels the older one.
	concurrency:
	  group: behavioral-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true

	# Workflow-wide GITHUB_TOKEN scope: read-only access to repository contents.
	permissions:
	  contents: read

	jobs:
	  # Decide which skills the change affects. This is secret-free (just git diff +
	  # a Python mapping). Its `any` output drives whether the behavioral job runs
	  # and whether the gate has anything to enforce for this PR.
	  discover:
	    name: Select behavioral tests
	    runs-on: ubuntu-latest
	    outputs:
	      skills: ${{ steps.select.outputs.skills }}
	      any: ${{ steps.select.outputs.any }}
	    steps:
	      - name: Check out repository
	        uses: actions/checkout@v4
	        with:
	          # Full history, so the three-dot diff below can locate the merge base.
	          fetch-depth: 0

	      - name: Set up uv
	        uses: astral-sh/setup-uv@v7

	      - name: Select skills
	        id: select
	        # Event data reaches the script through environment variables rather
	        # than `${{ }}` substitution inside the script text, so a crafted
	        # dispatch input cannot inject shell commands.
	        env:
	          EVENT_NAME: ${{ github.event_name }}
	          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
	          BASE_SHA: ${{ github.event.pull_request.base.sha }}
	          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
	        run: |
	          set -euo pipefail
	          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
	            if [ -n "$REQUESTED_SKILLS" ]; then
	              skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED_SKILLS")
	            else
	              skills=$(uv run .github/scripts/select_behavioral.py --all)
	            fi
	          else
	            # base...head diffs from the merge base, i.e. only what the PR itself
	            # changed; a two-dot diff would also list files that moved on the
	            # base branch after the PR forked.
	            skills=$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" \
	              | uv run .github/scripts/select_behavioral.py --changed)
	          fi
	          echo "Selected skills: $skills"
	          echo "skills=$skills" >> "$GITHUB_OUTPUT"
	          # The selector prints a JSON list; an empty list means nothing to run.
	          if [ "$skills" = "[]" ]; then
	            echo "any=false" >> "$GITHUB_OUTPUT"
	          else
	            echo "any=true" >> "$GITHUB_OUTPUT"
	          fi

	  behavioral:
	    name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
	    needs: discover
	    # Run whenever the change affects something testable.
	    if: needs.discover.outputs.any == 'true'
	    # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
	    # the matrix below so each skill is exercised on both platforms.
	    runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
	    # Behavioral runs install local models and can take a while; cap it so a
	    # hung agent or stalled model pull fails the job instead of burning minutes.
	    timeout-minutes: 45
	    strategy:
	      # One skill / OS failing should not hide the others' results.
	      fail-fast: false
	      matrix:
	        skill: ${{ fromJson(needs.discover.outputs.skills) }}
	        os: [Linux, Windows]
	    env:
	      ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
	      ANTHROPIC_BASE_URL: https://llm-api.amd.com/Anthropic
	      # NOTE(review): `user: a1_ucicd` is read here as a second header line of
	      # this block scalar, not as a separate env var -- confirm against the
	      # committed file.
	      ANTHROPIC_CUSTOM_HEADERS: |
	        Ocp-Apim-Subscription-Key: ${{ secrets.ORCHESTR_API_KEY }}
	        user: a1_ucicd
	      # Lets the harness default to this skill if a test relies on the env.
	      # The run steps below also read the skill name from here.
	      BEHAVIORAL_SKILL: ${{ matrix.skill }}
	      # Cost cap: sonnet only. The harness also enforces this under CI.
	      BEHAVIORAL_MODEL: sonnet
	    steps:
	      - name: Check out repository
	        uses: actions/checkout@v4

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: "3.12"

	      - name: Set up Node
	        uses: actions/setup-node@v4
	        with:
	          node-version: "20"

	      - name: Install the claude CLI
	        run: npm install -g @anthropic-ai/claude-code

	      - name: Install behavioral test dependencies
	        run: pip install -r eval/behavioral/requirements.txt

	      - name: Run behavioral test for ${{ matrix.skill }} (Linux)
	        if: matrix.os == 'Linux'
	        working-directory: eval/behavioral
	        shell: bash
	        run: |
	          set -euo pipefail
	          # Skill name comes from the job-level env, not from `${{ }}` pasted
	          # into the script. Test files use underscores where skills use dashes.
	          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
	          echo "Running $test_file"
	          pytest "$test_file"

	      - name: Run behavioral test for ${{ matrix.skill }} (Windows)
	        if: matrix.os == 'Windows'
	        working-directory: eval/behavioral
	        shell: powershell
	        run: |
	          $ErrorActionPreference = "Stop"
	          # Same mapping as the Linux step, reading the skill from the env.
	          $skill = $env:BEHAVIORAL_SKILL
	          $test_file = "tests/test_$($skill -replace '-','_').py"
	          Write-Host "Running $test_file"
	          pytest $test_file

	  # Single aggregate gate. Mark THIS check required in branch protection.
	  #
	  # * nothing testable changed -> pass (neutral).
	  # * testable change -> pass iff the behavioral job passed.
	  behavioral-gate:
	    name: behavioral
	    needs: [discover, behavioral]
	    # Always report, even when an upstream job failed or was skipped.
	    if: always()
	    runs-on: ubuntu-latest
	    env:
	      DISCOVER_RESULT: ${{ needs.discover.result }}
	      BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
	      AFFECTED: ${{ needs.discover.outputs.any }}
	      SKILLS: ${{ needs.discover.outputs.skills }}
	    steps:
	      - name: Verify behavioral results
	        run: |
	          echo "discover: $DISCOVER_RESULT"
	          echo "behavioral: $BEHAVIORAL_RESULT"
	          echo "affected: $AFFECTED ($SKILLS)"

	          # If discovery itself failed, surface that rather than guessing.
	          if [ "$DISCOVER_RESULT" != "success" ]; then
	            echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
	            exit 1
	          fi

	          # No skill or behavioral test changed: nothing to gate on.
	          if [ "$AFFECTED" != "true" ]; then
	            echo "No behavioral tests affected by this change."
	            exit 0
	          fi

	          # Something testable changed: the gate reflects the test result.
	          if [ "$BEHAVIORAL_RESULT" = "success" ]; then
	            echo "All affected behavioral tests passed."
	            exit 0
	          fi
	          echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
	          exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Point Behavioral tests to self-hosted runners #30

Workflow file

Point Behavioral tests to self-hosted runners #30

Uh oh!

Workflow file for this run