@@ -2,28 +2,22 @@ name: behavioral
22
33# Behavioral tests run a real agent against a skill and grade what it did (see
44# eval/behavioral/). They cost real API tokens and, for some skills, install
5- # and exercise local models, so the actual test job is opt-in. The design:
5+ # and exercise local models. The design:
66#
77# * selective -- only the skills whose folder or test changed are run (the
88# whole suite runs when the shared harness changes). See
99# .github/scripts/select_behavioral.py.
10- # * label-gated execution -- the test job (which holds ORCHESTR_API_KEY)
11- # only runs on manual dispatch or when a maintainer adds the
12- # `run_behavioral` label, keeping the secret away from untrusted / fork
13- # code that runs with tool permissions bypassed.
1410# * required when relevant -- when a PR changes a skill or test that maps to a
15- # behavioral test, the `behavioral` gate FAILS until the label is added and
16- # the tests pass. A PR that touches nothing testable passes neutrally.
11+ # behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
12+ # that touches nothing testable passes neutrally.
1713# * dispatchable -- run any subset by hand from the Actions tab.
1814#
1915# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
20- # branch protection can require just the `behavioral` check. `discover` is
21- # secret-free, so it runs on every matching PR to decide whether the label is
22- # required; only `behavioral` is gated on the label.
16+ # branch protection can require just the `behavioral` check.
2317
2418on :
2519 pull_request :
26- types : [opened, synchronize, reopened, labeled ]
20+ types : [opened, synchronize, reopened]
2721 paths :
2822 - " skills/**"
2923 - " eval/behavioral/**"
@@ -44,14 +38,10 @@ concurrency:
4438permissions :
4539 contents : read
4640
47- env :
48- BEHAVIORAL_LABEL : run_behavioral
49-
5041jobs :
5142 # Decide which skills the change affects. This is secret-free (just git diff +
52- # a Python mapping), so it runs on every matching PR regardless of the label;
53- # the label only gates the test job below. Its `any` output drives whether the
54- # label is required for this PR.
43+ # a Python mapping). Its `any` output drives whether the behavioral job runs
44+ # and whether the gate has anything to enforce for this PR.
5545 discover :
5646 name : Select behavioral tests
5747 runs-on : ubuntu-latest
9585 behavioral :
9686 name : Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
9787 needs : discover
98- # Run only when something testable changed AND the run is authorized:
99- # manual dispatch, or a maintainer added the `run_behavioral` label. This is
100- # the gate that protects the ORCHESTR_API_KEY secret.
101- if : >-
102- needs.discover.outputs.any == 'true' &&
103- (github.event_name == 'workflow_dispatch' ||
104- contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
88+ # Run whenever the change affects something testable.
89+ if : needs.discover.outputs.any == 'true'
10590 # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
10691 # the matrix below so each skill is exercised on both platforms.
10792 runs-on : [self-hosted, strix_halo, "${{ matrix.os }}"]
@@ -116,8 +101,7 @@ jobs:
116101 os : [Linux, Windows]
117102 env :
118103 # The CLI authenticates from this key and targets AMD's internal LLM
119- # gateway. This job only runs on labeled PRs and manual dispatch (see this
120- # job's `if:` above), keeping the secret away from untrusted / fork code.
104+ # gateway.
121105 ANTHROPIC_API_KEY : ${{ secrets.ORCHESTR_API_KEY }}
122106 ANTHROPIC_BASE_URL : https://llm-api.amd.com
123107 # The gateway identifies the calling user via a custom header.
@@ -169,9 +153,8 @@ jobs:
169153
170154 # Single aggregate gate. Mark THIS check required in branch protection.
171155 #
172- # * nothing testable changed -> pass (neutral).
173- # * testable change, label missing -> FAIL, asking for the label.
174- # * testable change, authorized -> pass iff the behavioral job passed.
156+ # * nothing testable changed -> pass (neutral).
157+ # * testable change -> pass iff the behavioral job passed.
175158 behavioral-gate :
176159 name : behavioral
177160 needs : [discover, behavioral]
@@ -182,15 +165,12 @@ jobs:
182165 BEHAVIORAL_RESULT : ${{ needs.behavioral.result }}
183166 AFFECTED : ${{ needs.discover.outputs.any }}
184167 SKILLS : ${{ needs.discover.outputs.skills }}
185- # 'true' only on a PR that carries the label; '' / 'false' otherwise.
186- LABEL_PRESENT : ${{ contains(github.event.pull_request.labels.*.name, 'run_behavioral') }}
187168 steps :
188169 - name : Verify behavioral results
189170 run : |
190171 echo "discover: $DISCOVER_RESULT"
191172 echo "behavioral: $BEHAVIORAL_RESULT"
192173 echo "affected: $AFFECTED ($SKILLS)"
193- echo "label: $LABEL_PRESENT"
194174
195175 # If discovery itself failed, surface that rather than guessing.
196176 if [ "$DISCOVER_RESULT" != "success" ]; then
@@ -204,18 +184,10 @@ jobs:
204184 exit 0
205185 fi
206186
207- # Something testable changed. Manual dispatch and labeled PRs are
208- # authorized to run the tests, so the gate reflects the test result.
209- if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
210- if [ "$BEHAVIORAL_RESULT" = "success" ]; then
211- echo "All affected behavioral tests passed."
212- exit 0
213- fi
214- echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
215- exit 1
187+ # Something testable changed: the gate reflects the test result.
188+ if [ "$BEHAVIORAL_RESULT" = "success" ]; then
189+ echo "All affected behavioral tests passed."
190+ exit 0
216191 fi
217-
218- # Testable change on a PR with no label: require it.
219- echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
220- echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
192+ echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
221193 exit 1
0 commit comments