@@ -2,20 +2,24 @@ name: behavioral
22
33# Behavioral tests run a real agent against a skill and grade what it did (see
44# eval/behavioral/). They cost real API tokens and, for some skills, install
5- # and exercise local models, so they are NOT part of the always-on PR gate.
6- # Instead they are:
5+ # and exercise local models, so the actual test job is opt-in. The design:
76#
8- # * label-gated on PRs -- a maintainer adds the `run-behavioral` label to opt
9- # a PR in. This keeps the ANTHROPIC_API_KEY secret away from untrusted /
10- # fork code, which runs with tool permissions bypassed.
117# * selective -- only the skills whose folder or test changed are run (the
128# whole suite runs when the shared harness changes). See
139# .github/scripts/select_behavioral.py.
10+ # * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
11+ # only runs on manual dispatch or when a maintainer adds the
12+ # `run-behavioral` label, keeping the secret away from untrusted / fork
13+ # code that runs with tool permissions bypassed.
14+ # * required when relevant -- when a PR changes a skill or test that maps to a
15+ # behavioral test, the `behavioral` gate FAILS until the label is added and
16+ # the tests pass. A PR that touches nothing testable passes neutrally.
1417# * dispatchable -- run any subset by hand from the Actions tab.
1518#
1619# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
17- # branch protection can require just the `behavioral` check. That gate passes
18- # (neutral) when the label is absent, so it never blocks an unlabeled PR.
20+ # branch protection can require just the `behavioral` check. `discover` is
21+ # secret-free, so it runs on every matching PR to decide whether the label is
22+ # required; only the `behavioral` test job (the matrix, not the aggregate
22+ # gate check of the same name) is gated on the label.
1923
2024on :
2125 pull_request :
4448 BEHAVIORAL_LABEL : run-behavioral
4549
4650jobs :
47- # Decide which skills to run. On PRs this is gated on the `run-behavioral`
48- # label so the secret is never exposed to code that a maintainer hasn't
49- # vouched for.
51+ # Decide which skills the change affects. This is secret-free (just git diff +
52+ # a Python mapping), so it runs on every matching PR regardless of the label;
53+ # the label only gates the test job below. Its `any` output drives whether the
54+ # label is required for this PR.
5055 discover :
5156 name : Select behavioral tests
52- if : >-
53- github.event_name == 'workflow_dispatch' ||
54- (github.event_name == 'pull_request' &&
55- contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
5657 runs-on : ubuntu-latest
5758 outputs :
5859 skills : ${{ steps.select.outputs.skills }}
9495 behavioral :
9596 name : Behavioral (${{ matrix.skill }})
9697 needs : discover
97- if : needs.discover.outputs.any == 'true'
98+ # Run only when something testable changed AND the run is authorized:
99+ # manual dispatch, or a maintainer added the `run-behavioral` label. This is
100+ # the gate that protects the ANTHROPIC_API_KEY secret.
101+ if : >-
102+ needs.discover.outputs.any == 'true' &&
103+ (github.event_name == 'workflow_dispatch' ||
104+ contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
98105 runs-on : ubuntu-latest
99106 # Behavioral runs install local models and can take a while; cap it so a
100107 # hung agent or stalled model pull fails the job instead of burning minutes.
@@ -127,8 +134,8 @@ jobs:
127134 - name : Run behavioral test for ${{ matrix.skill }}
128135 working-directory : eval/behavioral
129136 env :
130- # The CLI authenticates from this key; it is only present on labeled
131- # PRs and manual dispatch (see the `discover` gate above).
137+ # The CLI authenticates from this key. This job only runs on labeled
138+ # PRs and manual dispatch (see this job's `if:` above).
132139 ANTHROPIC_API_KEY : ${{ secrets.ANTHROPIC_API_KEY }}
133140 # Lets the harness default to this skill if a test relies on the env.
134141 BEHAVIORAL_SKILL : ${{ matrix.skill }}
@@ -139,33 +146,54 @@ jobs:
139146 pytest "$test_file"
140147
141148 # Single aggregate gate. Mark THIS check required in branch protection.
142- # It passes when behavioral tests were not requested (no label) and when
143- # every selected behavioral job succeeded, so it never blocks an unlabeled PR.
149+ #
150+ # * nothing testable changed -> pass (neutral).
151+ # * testable change, label missing -> FAIL, asking for the label.
152+ # * testable change, authorized -> pass iff the behavioral job passed.
144153 behavioral-gate :
145154 name : behavioral
146155 needs : [discover, behavioral]
147156 if : always()
148157 runs-on : ubuntu-latest
158+ env :
159+ DISCOVER_RESULT : ${{ needs.discover.result }}
160+ BEHAVIORAL_RESULT : ${{ needs.behavioral.result }}
161+ AFFECTED : ${{ needs.discover.outputs.any }}
162+ SKILLS : ${{ needs.discover.outputs.skills }}
163+ # 'true' only on a PR that carries the label; '' / 'false' otherwise.
164+ LABEL_PRESENT : ${{ contains(github.event.pull_request.labels.*.name, 'run-behavioral') }}
149165 steps :
150166 - name : Verify behavioral results
151167 run : |
152- discover_result="${{ needs.discover.result }}"
153- behavioral_result="${{ needs.behavioral.result }}"
154- echo "discover: $discover_result"
155- echo "behavioral: $behavioral_result"
156-
157- # Label absent (or dispatch skipped): behavioral tests were not
158- # requested for this run, so the gate is a no-op pass.
159- if [ "$discover_result" = "skipped" ]; then
160- echo "Behavioral tests not requested (no '${BEHAVIORAL_LABEL}' label)."
161- exit 0
168+ echo "discover: $DISCOVER_RESULT"
169+ echo "behavioral: $BEHAVIORAL_RESULT"
170+ echo "affected: $AFFECTED ($SKILLS)"
171+ echo "label: $LABEL_PRESENT"
172+
173+ # If discovery itself failed, surface that rather than guessing.
174+ if [ "$DISCOVER_RESULT" != "success" ]; then
175+ echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
176+ exit 1
162177 fi
163178
164- # Nothing matched the change set, or everything that ran passed.
165- if [ "$behavioral_result" = "success" ] || [ "$behavioral_result" = "skipped" ]; then
166- echo "All requested behavioral tests passed."
179+ # No skill or behavioral test changed: nothing to gate on.
180+ if [ "$AFFECTED" != "true" ]; then
181+ echo "No behavioral tests affected by this change."
167182 exit 0
168183 fi
169184
170- echo "One or more behavioral tests failed." >&2
185+ # Something testable changed. Manual dispatch and labeled PRs are
186+ # authorized to run the tests, so the gate reflects the test result.
187+ if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
188+ if [ "$BEHAVIORAL_RESULT" = "success" ]; then
189+ echo "All affected behavioral tests passed."
190+ exit 0
191+ fi
192+ echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
193+ exit 1
194+ fi
195+
196+ # Testable change on a PR with no label: require it.
197+ echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
198+ echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
171199 exit 1
0 commit comments