Skip to content

Commit 3f5d595

Browse files
committed
Add correct trigger mechanism
1 parent c662296 commit 3f5d595

1 file changed

Lines changed: 61 additions & 33 deletions

File tree

.github/workflows/behavioral.yml

Lines changed: 61 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -2,20 +2,24 @@ name: behavioral
22

33
# Behavioral tests run a real agent against a skill and grade what it did (see
44
# eval/behavioral/). They cost real API tokens and, for some skills, install
5-
# and exercise local models, so they are NOT part of the always-on PR gate.
6-
# Instead they are:
5+
# and exercise local models, so the actual test job is opt-in. The design:
76
#
8-
# * label-gated on PRs -- a maintainer adds the `run-behavioral` label to opt
9-
# a PR in. This keeps the ANTHROPIC_API_KEY secret away from untrusted /
10-
# fork code, which runs with tool permissions bypassed.
117
# * selective -- only the skills whose folder or test changed are run (the
128
# whole suite runs when the shared harness changes). See
139
# .github/scripts/select_behavioral.py.
10+
# * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
11+
# only runs on manual dispatch or when a maintainer adds the
12+
# `run-behavioral` label, keeping the secret away from untrusted / fork
13+
# code that runs with tool permissions bypassed.
14+
# * required when relevant -- when a PR changes a skill or test that maps to a
15+
# behavioral test, the `behavioral` gate FAILS until the label is added and
16+
# the tests pass. A PR that touches nothing testable passes neutrally.
1417
# * dispatchable -- run any subset by hand from the Actions tab.
1518
#
1619
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
17-
# branch protection can require just the `behavioral` check. That gate passes
18-
# (neutral) when the label is absent, so it never blocks an unlabeled PR.
20+
# branch protection can require just the `behavioral` check. `discover` is
21+
# secret-free, so it runs on every matching PR to decide whether the label is
22+
# required; only the `behavioral` test job (not the aggregate gate) is gated on the label.
1923

2024
on:
2125
pull_request:
@@ -44,15 +48,12 @@ env:
4448
BEHAVIORAL_LABEL: run-behavioral
4549

4650
jobs:
47-
# Decide which skills to run. On PRs this is gated on the `run-behavioral`
48-
# label so the secret is never exposed to code that a maintainer hasn't
49-
# vouched for.
51+
# Decide which skills the change affects. This is secret-free (just git diff +
52+
# a Python mapping), so it runs on every matching PR regardless of the label;
53+
# the label only gates the test job below. Its `any` output drives whether the
54+
# label is required for this PR.
5055
discover:
5156
name: Select behavioral tests
52-
if: >-
53-
github.event_name == 'workflow_dispatch' ||
54-
(github.event_name == 'pull_request' &&
55-
contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
5657
runs-on: ubuntu-latest
5758
outputs:
5859
skills: ${{ steps.select.outputs.skills }}
@@ -94,7 +95,13 @@ jobs:
9495
behavioral:
9596
name: Behavioral (${{ matrix.skill }})
9697
needs: discover
97-
if: needs.discover.outputs.any == 'true'
98+
# Run only when something testable changed AND the run is authorized:
99+
# manual dispatch, or a maintainer added the `run-behavioral` label. This is
100+
# the gate that protects the ANTHROPIC_API_KEY secret.
101+
if: >-
102+
needs.discover.outputs.any == 'true' &&
103+
(github.event_name == 'workflow_dispatch' ||
104+
contains(github.event.pull_request.labels.*.name, 'run-behavioral'))
98105
runs-on: ubuntu-latest
99106
# Behavioral runs install local models and can take a while; cap it so a
100107
# hung agent or stalled model pull fails the job instead of burning minutes.
@@ -127,8 +134,8 @@ jobs:
127134
- name: Run behavioral test for ${{ matrix.skill }}
128135
working-directory: eval/behavioral
129136
env:
130-
# The CLI authenticates from this key; it is only present on labeled
131-
# PRs and manual dispatch (see the `discover` gate above).
137+
# The CLI authenticates from this key. This job only runs on labeled
138+
# PRs and manual dispatch (see this job's `if:` above).
132139
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
133140
# Lets the harness default to this skill if a test relies on the env.
134141
BEHAVIORAL_SKILL: ${{ matrix.skill }}
@@ -139,33 +146,54 @@ jobs:
139146
pytest "$test_file"
140147
141148
# Single aggregate gate. Mark THIS check required in branch protection.
142-
# It passes when behavioral tests were not requested (no label) and when
143-
# every selected behavioral job succeeded, so it never blocks an unlabeled PR.
149+
#
150+
# * nothing testable changed -> pass (neutral).
151+
# * testable change, label missing -> FAIL, asking for the label.
152+
# * testable change, authorized -> pass iff the behavioral job passed.
144153
behavioral-gate:
145154
name: behavioral
146155
needs: [discover, behavioral]
147156
if: always()
148157
runs-on: ubuntu-latest
158+
env:
159+
DISCOVER_RESULT: ${{ needs.discover.result }}
160+
BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
161+
AFFECTED: ${{ needs.discover.outputs.any }}
162+
SKILLS: ${{ needs.discover.outputs.skills }}
163+
# 'true' only on a PR that carries the label; '' / 'false' otherwise.
164+
LABEL_PRESENT: ${{ contains(github.event.pull_request.labels.*.name, 'run-behavioral') }}
149165
steps:
150166
- name: Verify behavioral results
151167
run: |
152-
discover_result="${{ needs.discover.result }}"
153-
behavioral_result="${{ needs.behavioral.result }}"
154-
echo "discover: $discover_result"
155-
echo "behavioral: $behavioral_result"
156-
157-
# Label absent (or dispatch skipped): behavioral tests were not
158-
# requested for this run, so the gate is a no-op pass.
159-
if [ "$discover_result" = "skipped" ]; then
160-
echo "Behavioral tests not requested (no '${BEHAVIORAL_LABEL}' label)."
161-
exit 0
168+
echo "discover: $DISCOVER_RESULT"
169+
echo "behavioral: $BEHAVIORAL_RESULT"
170+
echo "affected: $AFFECTED ($SKILLS)"
171+
echo "label: $LABEL_PRESENT"
172+
173+
# If discovery itself failed, surface that rather than guessing.
174+
if [ "$DISCOVER_RESULT" != "success" ]; then
175+
echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
176+
exit 1
162177
fi
163178
164-
# Nothing matched the change set, or everything that ran passed.
165-
if [ "$behavioral_result" = "success" ] || [ "$behavioral_result" = "skipped" ]; then
166-
echo "All requested behavioral tests passed."
179+
# No skill or behavioral test changed: nothing to gate on.
180+
if [ "$AFFECTED" != "true" ]; then
181+
echo "No behavioral tests affected by this change."
167182
exit 0
168183
fi
169184
170-
echo "One or more behavioral tests failed." >&2
185+
# Something testable changed. Manual dispatch and labeled PRs are
186+
# authorized to run the tests, so the gate reflects the test result.
187+
if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
188+
if [ "$BEHAVIORAL_RESULT" = "success" ]; then
189+
echo "All affected behavioral tests passed."
190+
exit 0
191+
fi
192+
echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
193+
exit 1
194+
fi
195+
196+
# Testable change on a PR with no label: require it.
197+
echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
198+
echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
171199
exit 1

0 commit comments

Comments
 (0)