Skip to content

Commit acd5fe9

Browse files
Point Behavioral tests to self-hosted runners (#77)
1 parent 0718ef0 commit acd5fe9

2 files changed

Lines changed: 59 additions & 61 deletions

File tree

.github/workflows/behavioral.yml

Lines changed: 46 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,22 @@ name: behavioral
22

33
# Behavioral tests run a real agent against a skill and grade what it did (see
44
# eval/behavioral/). They cost real API tokens and, for some skills, install
5-
# and exercise local models, so the actual test job is opt-in. The design:
5+
# and exercise local models. The design:
66
#
77
# * selective -- only the skills whose folder or test changed are run (the
88
# whole suite runs when the shared harness changes). See
99
# .github/scripts/select_behavioral.py.
10-
# * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
11-
# only runs on manual dispatch or when a maintainer adds the
12-
# `run_behavioral` label, keeping the secret away from untrusted / fork
13-
# code that runs with tool permissions bypassed.
1410
# * required when relevant -- when a PR changes a skill or test that maps to a
15-
# behavioral test, the `behavioral` gate FAILS until the label is added and
16-
# the tests pass. A PR that touches nothing testable passes neutrally.
11+
# behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
12+
# that touches nothing testable passes neutrally.
1713
# * dispatchable -- run any subset by hand from the Actions tab.
1814
#
1915
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
20-
# branch protection can require just the `behavioral` check. `discover` is
21-
# secret-free, so it runs on every matching PR to decide whether the label is
22-
# required; only `behavioral` is gated on the label.
16+
# branch protection can require just the `behavioral` check.
2317

2418
on:
2519
pull_request:
26-
types: [opened, synchronize, reopened, labeled]
20+
types: [opened, synchronize, reopened]
2721
paths:
2822
- "skills/**"
2923
- "eval/behavioral/**"
@@ -44,14 +38,10 @@ concurrency:
4438
permissions:
4539
contents: read
4640

47-
env:
48-
BEHAVIORAL_LABEL: run_behavioral
49-
5041
jobs:
5142
# Decide which skills the change affects. This is secret-free (just git diff +
52-
# a Python mapping), so it runs on every matching PR regardless of the label;
53-
# the label only gates the test job below. Its `any` output drives whether the
54-
# label is required for this PR.
43+
# a Python mapping). Its `any` output drives whether the behavioral job runs
44+
# and whether the gate has anything to enforce for this PR.
5545
discover:
5646
name: Select behavioral tests
5747
runs-on: ubuntu-latest
@@ -93,24 +83,32 @@ jobs:
9383
fi
9484
9585
behavioral:
96-
name: Behavioral (${{ matrix.skill }})
86+
name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
9787
needs: discover
98-
# Run only when something testable changed AND the run is authorized:
99-
# manual dispatch, or a maintainer added the `run_behavioral` label. This is
100-
# the gate that protects the ANTHROPIC_API_KEY secret.
101-
if: >-
102-
needs.discover.outputs.any == 'true' &&
103-
(github.event_name == 'workflow_dispatch' ||
104-
contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
105-
runs-on: ubuntu-latest
88+
# Run whenever the change affects something testable.
89+
if: needs.discover.outputs.any == 'true'
90+
# Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
91+
# the matrix below so each skill is exercised on both platforms.
92+
runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
10693
# Behavioral runs install local models and can take a while; cap it so a
10794
# hung agent or stalled model pull fails the job instead of tying up a runner.
10895
timeout-minutes: 45
10996
strategy:
110-
# One skill failing should not hide the others' results.
97+
# One skill / OS failing should not hide the others' results.
11198
fail-fast: false
11299
matrix:
113100
skill: ${{ fromJson(needs.discover.outputs.skills) }}
101+
os: [Linux, Windows]
102+
env:
103+
ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
104+
ANTHROPIC_BASE_URL: https://llm-api.amd.com/Anthropic
105+
ANTHROPIC_CUSTOM_HEADERS: |
106+
Ocp-Apim-Subscription-Key: ${{ secrets.ORCHESTR_API_KEY }}
107+
user: a1_ucicd
108+
# Lets the harness default to this skill if a test relies on the env.
109+
BEHAVIORAL_SKILL: ${{ matrix.skill }}
110+
# Cost cap: sonnet only. The harness also enforces this under CI.
111+
BEHAVIORAL_MODEL: sonnet
114112
steps:
115113
- name: Check out repository
116114
uses: actions/checkout@v4
@@ -131,27 +129,31 @@ jobs:
131129
- name: Install behavioral test dependencies
132130
run: pip install -r eval/behavioral/requirements.txt
133131

134-
- name: Run behavioral test for ${{ matrix.skill }}
132+
- name: Run behavioral test for ${{ matrix.skill }} (Linux)
133+
if: matrix.os == 'Linux'
135134
working-directory: eval/behavioral
136-
env:
137-
# The CLI authenticates from this key. This job only runs on labeled
138-
# PRs and manual dispatch (see this job's `if:` above).
139-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
140-
# Lets the harness default to this skill if a test relies on the env.
141-
BEHAVIORAL_SKILL: ${{ matrix.skill }}
142-
# Cost cap: sonnet only. The harness also enforces this under CI.
143-
BEHAVIORAL_MODEL: sonnet
135+
shell: bash
144136
run: |
145137
set -euo pipefail
146138
test_file="tests/test_$(echo '${{ matrix.skill }}' | tr '-' '_').py"
147139
echo "Running $test_file"
148140
pytest "$test_file"
149141
142+
- name: Run behavioral test for ${{ matrix.skill }} (Windows)
143+
if: matrix.os == 'Windows'
144+
working-directory: eval/behavioral
145+
shell: powershell
146+
run: |
147+
$ErrorActionPreference = "Stop"
148+
$skill = "${{ matrix.skill }}"
149+
$test_file = "tests/test_$($skill -replace '-','_').py"
150+
Write-Host "Running $test_file"
151+
pytest $test_file
152+
150153
# Single aggregate gate. Mark THIS check required in branch protection.
151154
#
152-
# * nothing testable changed -> pass (neutral).
153-
# * testable change, label missing -> FAIL, asking for the label.
154-
# * testable change, authorized -> pass iff the behavioral job passed.
155+
# * nothing testable changed -> pass (neutral).
156+
# * testable change -> pass iff the behavioral job passed.
155157
behavioral-gate:
156158
name: behavioral
157159
needs: [discover, behavioral]
@@ -162,15 +164,12 @@ jobs:
162164
BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
163165
AFFECTED: ${{ needs.discover.outputs.any }}
164166
SKILLS: ${{ needs.discover.outputs.skills }}
165-
# 'true' only on a PR that carries the label; '' / 'false' otherwise.
166-
LABEL_PRESENT: ${{ contains(github.event.pull_request.labels.*.name, 'run_behavioral') }}
167167
steps:
168168
- name: Verify behavioral results
169169
run: |
170170
echo "discover: $DISCOVER_RESULT"
171171
echo "behavioral: $BEHAVIORAL_RESULT"
172172
echo "affected: $AFFECTED ($SKILLS)"
173-
echo "label: $LABEL_PRESENT"
174173
175174
# If discovery itself failed, surface that rather than guessing.
176175
if [ "$DISCOVER_RESULT" != "success" ]; then
@@ -184,18 +183,10 @@ jobs:
184183
exit 0
185184
fi
186185
187-
# Something testable changed. Manual dispatch and labeled PRs are
188-
# authorized to run the tests, so the gate reflects the test result.
189-
if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "$LABEL_PRESENT" = "true" ]; then
190-
if [ "$BEHAVIORAL_RESULT" = "success" ]; then
191-
echo "All affected behavioral tests passed."
192-
exit 0
193-
fi
194-
echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
195-
exit 1
186+
# Something testable changed: the gate reflects the test result.
187+
if [ "$BEHAVIORAL_RESULT" = "success" ]; then
188+
echo "All affected behavioral tests passed."
189+
exit 0
196190
fi
197-
198-
# Testable change on a PR with no label: require it.
199-
echo "::error::This PR changes a skill or behavioral test ($SKILLS) but the '${BEHAVIORAL_LABEL}' label is not set." >&2
200-
echo "Add the '${BEHAVIORAL_LABEL}' label to run the required behavioral tests for these changes." >&2
191+
echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
201192
exit 1

eval/behavioral/harness.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,16 @@ def check_api_reachable(model: str | None = DEFAULT_MODEL, timeout: int = 60) ->
9797
return False, "'claude' CLI not found on PATH"
9898

9999
model = _enforce_model_policy(model)
100-
cmd = [claude_bin, "-p", "Reply with the single word: ok", "--output-format", "json"]
100+
cmd = [claude_bin, "-p", "--output-format", "json"]
101101
if model:
102102
cmd += ["--model", model]
103103

104+
# The prompt is sent over stdin (see `_run_agent` for why); this one is a
105+
# single line, but it uses the same mechanism for consistency.
104106
try:
105107
proc = subprocess.run(
106108
cmd, capture_output=True, text=True, encoding="utf-8",
107-
stdin=subprocess.DEVNULL, timeout=timeout, env=_claude_env(),
109+
input="Reply with the single word: ok", timeout=timeout, env=_claude_env(),
108110
)
109111
except subprocess.TimeoutExpired:
110112
return False, f"API preflight timed out after {timeout}s (is the network reachable?)"
@@ -135,7 +137,7 @@ def _run_agent(prompt_text: str, workspace: Path, model: str | None, effort: str
135137
raise RuntimeError("'claude' CLI not found on PATH")
136138

137139
cmd = [
138-
claude_bin, "-p", prompt_text,
140+
claude_bin, "-p",
139141
"--output-format", "stream-json", "--verbose",
140142
"--dangerously-skip-permissions",
141143
"--add-dir", str(workspace),
@@ -145,9 +147,14 @@ def _run_agent(prompt_text: str, workspace: Path, model: str | None, effort: str
145147
if effort:
146148
cmd += ["--effort", effort]
147149

150+
# Pass the prompt over stdin rather than as an argv string. On Windows, when
151+
# `claude` resolves to a .cmd/.ps1 shim, a multi-line command-line argument
152+
# is re-parsed by cmd.exe/PowerShell and truncated at the first newline.
153+
# stdin is a raw byte stream and is immune to that on all platforms, so
154+
# multi-line test prompts stay intact.
148155
proc = subprocess.run(
149156
cmd, cwd=str(workspace), capture_output=True, text=True,
150-
encoding="utf-8", stdin=subprocess.DEVNULL, env=_claude_env(),
157+
encoding="utf-8", input=prompt_text, env=_claude_env(),
151158
)
152159

153160
events: list[dict] = []
@@ -229,7 +236,7 @@ def _grade_with_llm(statement: str, run: "Run", judge_model: str | None) -> tupl
229236
'{"pass": true|false, "reason": "<one short sentence>"}'
230237
)
231238
cmd = [
232-
claude_bin, "-p", prompt_text,
239+
claude_bin, "-p",
233240
"--output-format", "json",
234241
"--dangerously-skip-permissions",
235242
"--add-dir", str(run.workspace),
@@ -240,7 +247,7 @@ def _grade_with_llm(statement: str, run: "Run", judge_model: str | None) -> tupl
240247
try:
241248
proc = subprocess.run(
242249
cmd, capture_output=True, text=True, encoding="utf-8",
243-
stdin=subprocess.DEVNULL, timeout=180, env=_claude_env(),
250+
input=prompt_text, timeout=180, env=_claude_env(),
244251
)
245252
except subprocess.TimeoutExpired:
246253
return False, "llm_judge timed out after 180s"

0 commit comments

Comments
 (0)