Skip to content

Commit 05324f2

Browse files
committed
Drop label requirement
1 parent 1d95b72 commit 05324f2

1 file changed

Lines changed: 17 additions & 45 deletions

File tree

.github/workflows/behavioral.yml

Lines changed: 17 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -2,28 +2,22 @@ name: behavioral
22

33
# Behavioral tests run a real agent against a skill and grade what it did (see
44
# eval/behavioral/). They cost real API tokens and, for some skills, install
5-
# and exercise local models, so the actual test job is opt-in. The design:
5+
# and exercise local models. The design:
66
#
77
# * selective -- only the skills whose folder or test changed are run (the
88
# whole suite runs when the shared harness changes). See
99
# .github/scripts/select_behavioral.py.
10-
# * label-gated execution -- the test job (which holds ORCHESTR_API_KEY)
11-
# only runs on manual dispatch or when a maintainer adds the
12-
# `run_behavioral` label, keeping the secret away from untrusted / fork
13-
# code that runs with tool permissions bypassed.
1410
# * required when relevant -- when a PR changes a skill or test that maps to a
15-
# behavioral test, the `behavioral` gate FAILS until the label is added and
16-
# the tests pass. A PR that touches nothing testable passes neutrally.
11+
# behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
12+
# that touches nothing testable passes neutrally.
1713
# * dispatchable -- run any subset by hand from the Actions tab.
1814
#
1915
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
20-
# branch protection can require just the `behavioral` check. `discover` is
21-
# secret-free, so it runs on every matching PR to decide whether the label is
22-
# required; only `behavioral` is gated on the label.
16+
# branch protection can require just the `behavioral` check.
2317

2418
on:
2519
pull_request:
26-
types: [opened, synchronize, reopened, labeled]
20+
types: [opened, synchronize, reopened]
2721
paths:
2822
- "skills/**"
2923
- "eval/behavioral/**"
@@ -44,14 +38,10 @@ concurrency:
4438
permissions:
4539
contents: read
4640

47-
env:
48-
BEHAVIORAL_LABEL: run_behavioral
49-
5041
jobs:
5142
# Decide which skills the change affects. This is secret-free (just git diff +
52-
# a Python mapping), so it runs on every matching PR regardless of the label;
53-
# the label only gates the test job below. Its `any` output drives whether the
54-
# label is required for this PR.
43+
# a Python mapping). Its `any` output drives whether the behavioral job runs
44+
# and whether the gate has anything to enforce for this PR.
5545
discover:
5646
name: Select behavioral tests
5747
runs-on: ubuntu-latest
@@ -95,13 +85,8 @@ jobs:
9585
behavioral:
9686
name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
9787
needs: discover
98-
# Run only when something testable changed AND the run is authorized:
99-
# manual dispatch, or a maintainer added the `run_behavioral` label. This is
100-
# the gate that protects the ORCHESTR_API_KEY secret.
101-
if: >-
102-
needs.discover.outputs.any == 'true' &&
103-
(github.event_name == 'workflow_dispatch' ||
104-
contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
88+
# Run whenever the change affects something testable.
89+
if: needs.discover.outputs.any == 'true'
10590
# Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
10691
# the matrix below so each skill is exercised on both platforms.
10792
runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
@@ -116,8 +101,7 @@ jobs:
116101
os: [Linux, Windows]
117102
env:
118103
# The CLI authenticates from this key and targets AMD's internal LLM
119-
# gateway. This job only runs on labeled PRs and manual dispatch (see this
120-
# job's `if:` above), keeping the secret away from untrusted / fork code.
104+
# gateway.
121105
ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
122106
ANTHROPIC_BASE_URL: https://llm-api.amd.com
123107
# The gateway identifies the calling user via a custom header.
@@ -169,9 +153,8 @@ jobs:
169153
170154
# Single aggregate gate. Mark THIS check required in branch protection.
171155
#
172-
# * nothing testable changed -> pass (neutral).
173-
# * testable change, label missing -> FAIL, asking for the label.
174-
# * testable change, authorized -> pass iff the behavioral job passed.
156+
# * nothing testable changed -> pass (neutral).
157+
# * testable change -> pass iff the behavioral job passed.
175158
behavioral-gate:
176159
name: behavioral
177160
needs: [discover, behavioral]
@@ -182,15 +165,12 @@ jobs:
182165
BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
183166
AFFECTED: ${{ needs.discover.outputs.any }}
184167
SKILLS: ${{ needs.discover.outputs.skills }}
185-
# 'true' only on a PR that carries the label; '' / 'false' otherwise.
186-
LABEL_PRESENT: ${{ contains(github.event.pull_request.labels.*.name, 'run_behavioral') }}
187168
steps:
188169
- name: Verify behavioral results
189170
run: |
190171
echo "discover: $DISCOVER_RESULT"
191172
echo "behavioral: $BEHAVIORAL_RESULT"
192173
echo "affected: $AFFECTED ($SKILLS)"
193-
echo "label: $LABEL_PRESENT"
194174
195175
# If discovery itself failed, surface that rather than guessing.
196176
if [ "$DISCOVER_RESULT" != "success" ]; then
@@ -204,18 +184,10 @@ jobs:
204184
exit 0
205185
fi
206186
207-
# Something testable changed. Manual dispatch and labeled PRs are
208-
# authorized to run the tests, so the gate reflects the test result.
209-
if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
210-
if [ "$BEHAVIORAL_RESULT" = "success" ]; then
211-
echo "All affected behavioral tests passed."
212-
exit 0
213-
fi
214-
echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
215-
exit 1
187+
# Something testable changed: the gate reflects the test result.
188+
if [ "$BEHAVIORAL_RESULT" = "success" ]; then
189+
echo "All affected behavioral tests passed."
190+
exit 0
216191
fi
217-
218-
# Testable change on a PR with no label: require it.
219-
echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
220-
echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
192+
echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
221193
exit 1

0 commit comments

Comments
 (0)