@@ -2,28 +2,22 @@ name: behavioral
22
33# Behavioral tests run a real agent against a skill and grade what it did (see
44# eval/behavioral/). They cost real API tokens and, for some skills, install
5- # and exercise local models, so the actual test job is opt-in . The design:
5+ # and exercise local models. The design:
66#
77# * selective -- only the skills whose folder or test changed are run (the
88# whole suite runs when the shared harness changes). See
99# .github/scripts/select_behavioral.py.
10- # * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
11- # only runs on manual dispatch or when a maintainer adds the
12- # `run_behavioral` label, keeping the secret away from untrusted / fork
13- # code that runs with tool permissions bypassed.
1410# * required when relevant -- when a PR changes a skill or test that maps to a
15- # behavioral test, the `behavioral` gate FAILS until the label is added and
16- # the tests pass. A PR that touches nothing testable passes neutrally.
11+ # behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
12+ # that touches nothing testable passes neutrally.
1713# * dispatchable -- run any subset by hand from the Actions tab.
1814#
1915# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
20- # branch protection can require just the `behavioral` check. `discover` is
21- # secret-free, so it runs on every matching PR to decide whether the label is
22- # required; only `behavioral` is gated on the label.
16+ # branch protection can require just the `behavioral` check.
2317
2418on :
2519 pull_request :
26- types : [opened, synchronize, reopened, labeled ]
20+ types : [opened, synchronize, reopened]
2721 paths :
2822 - " skills/**"
2923 - " eval/behavioral/**"
@@ -44,14 +38,10 @@ concurrency:
4438permissions :
4539 contents : read
4640
47- env :
48- BEHAVIORAL_LABEL : run_behavioral
49-
5041jobs :
5142 # Decide which skills the change affects. This is secret-free (just git diff +
52- # a Python mapping), so it runs on every matching PR regardless of the label;
53- # the label only gates the test job below. Its `any` output drives whether the
54- # label is required for this PR.
43+ # a Python mapping). Its `any` output drives whether the behavioral job runs
44+ # and whether the gate has anything to enforce for this PR.
5545 discover :
5646 name : Select behavioral tests
5747 runs-on : ubuntu-latest
@@ -93,24 +83,32 @@ jobs:
9383 fi
9484
9585 behavioral :
96- name : Behavioral (${{ matrix.skill }})
86+ name : Behavioral (${{ matrix.skill }} on ${{ matrix.os }} )
9787 needs : discover
98- # Run only when something testable changed AND the run is authorized:
99- # manual dispatch, or a maintainer added the `run_behavioral` label. This is
100- # the gate that protects the ANTHROPIC_API_KEY secret.
101- if : >-
102- needs.discover.outputs.any == 'true' &&
103- (github.event_name == 'workflow_dispatch' ||
104- contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
105- runs-on : ubuntu-latest
88+ # Run whenever the change affects something testable.
89+ if : needs.discover.outputs.any == 'true'
90+ # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
91+ # the matrix below so each skill is exercised on both platforms.
92+ runs-on : [self-hosted, strix_halo, "${{ matrix.os }}"]
10693 # Behavioral runs install local models and can take a while; cap it so a
10794 # hung agent or stalled model pull fails the job instead of burning minutes.
10895 timeout-minutes : 45
10996 strategy :
110- # One skill failing should not hide the others' results.
97+ # One skill / OS failing should not hide the others' results.
11198 fail-fast : false
11299 matrix :
113100 skill : ${{ fromJson(needs.discover.outputs.skills) }}
101+ os : [Linux, Windows]
102+ env :
103+ ANTHROPIC_API_KEY : ${{ secrets.ORCHESTR_API_KEY }}
104+ ANTHROPIC_BASE_URL : https://llm-api.amd.com/Anthropic
105+ ANTHROPIC_CUSTOM_HEADERS : |
106+ Ocp-Apim-Subscription-Key: ${{ secrets.ORCHESTR_API_KEY }}
107+ user: a1_ucicd
108+ # Lets the harness default to this skill if a test relies on the env.
109+ BEHAVIORAL_SKILL : ${{ matrix.skill }}
110+ # Cost cap: sonnet only. The harness also enforces this under CI.
111+ BEHAVIORAL_MODEL : sonnet
114112 steps :
115113 - name : Check out repository
116114 uses : actions/checkout@v4
@@ -131,27 +129,31 @@ jobs:
131129 - name : Install behavioral test dependencies
132130 run : pip install -r eval/behavioral/requirements.txt
133131
134- - name : Run behavioral test for ${{ matrix.skill }}
132+ - name : Run behavioral test for ${{ matrix.skill }} (Linux)
133+ if : matrix.os == 'Linux'
135134 working-directory : eval/behavioral
136- env :
137- # The CLI authenticates from this key. This job only runs on labeled
138- # PRs and manual dispatch (see this job's `if:` above).
139- ANTHROPIC_API_KEY : ${{ secrets.ANTHROPIC_API_KEY }}
140- # Lets the harness default to this skill if a test relies on the env.
141- BEHAVIORAL_SKILL : ${{ matrix.skill }}
142- # Cost cap: sonnet only. The harness also enforces this under CI.
143- BEHAVIORAL_MODEL : sonnet
135+ shell : bash
144136 run : |
145137 set -euo pipefail
146138 test_file="tests/test_$(echo '${{ matrix.skill }}' | tr '-' '_').py"
147139 echo "Running $test_file"
148140 pytest "$test_file"
149141
142+ - name : Run behavioral test for ${{ matrix.skill }} (Windows)
143+ if : matrix.os == 'Windows'
144+ working-directory : eval/behavioral
145+ shell : powershell
146+ run : |
147+ $ErrorActionPreference = "Stop"
148+ $skill = "${{ matrix.skill }}"
149+ $test_file = "tests/test_$($skill -replace '-','_').py"
150+ Write-Host "Running $test_file"
151+ pytest $test_file
152+
150153 # Single aggregate gate. Mark THIS check required in branch protection.
151154 #
152- # * nothing testable changed -> pass (neutral).
153- # * testable change, label missing -> FAIL, asking for the label.
154- # * testable change, authorized -> pass iff the behavioral job passed.
155+ # * nothing testable changed -> pass (neutral).
156+ # * testable change -> pass iff the behavioral job passed.
155157 behavioral-gate :
156158 name : behavioral
157159 needs : [discover, behavioral]
@@ -162,15 +164,12 @@ jobs:
162164 BEHAVIORAL_RESULT : ${{ needs.behavioral.result }}
163165 AFFECTED : ${{ needs.discover.outputs.any }}
164166 SKILLS : ${{ needs.discover.outputs.skills }}
165- # 'true' only on a PR that carries the label; '' / 'false' otherwise.
166- LABEL_PRESENT : ${{ contains(github.event.pull_request.labels.*.name, 'run_behavioral') }}
167167 steps :
168168 - name : Verify behavioral results
169169 run : |
170170 echo "discover: $DISCOVER_RESULT"
171171 echo "behavioral: $BEHAVIORAL_RESULT"
172172 echo "affected: $AFFECTED ($SKILLS)"
173- echo "label: $LABEL_PRESENT"
174173
175174 # If discovery itself failed, surface that rather than guessing.
176175 if [ "$DISCOVER_RESULT" != "success" ]; then
@@ -184,18 +183,10 @@ jobs:
184183 exit 0
185184 fi
186185
187- # Something testable changed. Manual dispatch and labeled PRs are
188- # authorized to run the tests, so the gate reflects the test result.
189- if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
190- if [ "$BEHAVIORAL_RESULT" = "success" ]; then
191- echo "All affected behavioral tests passed."
192- exit 0
193- fi
194- echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
195- exit 1
186+ # Something testable changed: the gate reflects the test result.
187+ if [ "$BEHAVIORAL_RESULT" = "success" ]; then
188+ echo "All affected behavioral tests passed."
189+ exit 0
196190 fi
197-
198- # Testable change on a PR with no label: require it.
199- echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
200- echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
191+ echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
201192 exit 1
0 commit comments