Skip to content

Commit 1d95b72

Browse files
committed
Point to self-hosted runners
1 parent c8d6fe1 commit 1d95b72

1 file changed

Lines changed: 34 additions & 14 deletions

File tree

.github/workflows/behavioral.yml

Lines changed: 34 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ name: behavioral
77
# * selective -- only the skills whose folder or test changed are run (the
88
# whole suite runs when the shared harness changes). See
99
# .github/scripts/select_behavioral.py.
10-
# * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY)
10+
# * label-gated execution -- the test job (which holds ORCHESTR_API_KEY)
1111
# only runs on manual dispatch or when a maintainer adds the
1212
# `run_behavioral` label, keeping the secret away from untrusted / fork
1313
# code that runs with tool permissions bypassed.
@@ -93,24 +93,39 @@ jobs:
9393
fi
9494
9595
behavioral:
96-
name: Behavioral (${{ matrix.skill }})
96+
name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
9797
needs: discover
9898
# Run only when something testable changed AND the run is authorized:
9999
# manual dispatch, or a maintainer added the `run_behavioral` label. This is
100-
# the gate that protects the ANTHROPIC_API_KEY secret.
100+
# the gate that protects the ORCHESTR_API_KEY secret.
101101
if: >-
102102
needs.discover.outputs.any == 'true' &&
103103
(github.event_name == 'workflow_dispatch' ||
104104
contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
105-
runs-on: ubuntu-latest
105+
# Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
106+
# the matrix below so each skill is exercised on both platforms.
107+
runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
106108
# Behavioral runs install local models and can take a while; cap it so a
107109
# hung agent or stalled model pull fails the job instead of burning minutes.
108110
timeout-minutes: 45
109111
strategy:
110-
# One skill failing should not hide the others' results.
112+
# One skill / OS failing should not hide the others' results.
111113
fail-fast: false
112114
matrix:
113115
skill: ${{ fromJson(needs.discover.outputs.skills) }}
116+
os: [Linux, Windows]
117+
env:
118+
# The CLI authenticates from this key and targets AMD's internal LLM
119+
# gateway. This job only runs on labeled PRs and manual dispatch (see this
120+
# job's `if:` above), keeping the secret away from untrusted / fork code.
121+
ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
122+
ANTHROPIC_BASE_URL: https://llm-api.amd.com
123+
# The gateway identifies the calling user via a custom header.
124+
ANTHROPIC_CUSTOM_HEADERS: "x-user: a1_ucicd"
125+
# Lets the harness default to this skill if a test relies on the env.
126+
BEHAVIORAL_SKILL: ${{ matrix.skill }}
127+
# Cost cap: sonnet only. The harness also enforces this under CI.
128+
BEHAVIORAL_MODEL: sonnet
114129
steps:
115130
- name: Check out repository
116131
uses: actions/checkout@v4
@@ -131,22 +146,27 @@ jobs:
131146
- name: Install behavioral test dependencies
132147
run: pip install -r eval/behavioral/requirements.txt
133148

134-
- name: Run behavioral test for ${{ matrix.skill }}
149+
- name: Run behavioral test for ${{ matrix.skill }} (Linux)
150+
if: matrix.os == 'Linux'
135151
working-directory: eval/behavioral
136-
env:
137-
# The CLI authenticates from this key. This job only runs on labeled
138-
# PRs and manual dispatch (see this job's `if:` above).
139-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
140-
# Lets the harness default to this skill if a test relies on the env.
141-
BEHAVIORAL_SKILL: ${{ matrix.skill }}
142-
# Cost cap: sonnet only. The harness also enforces this under CI.
143-
BEHAVIORAL_MODEL: sonnet
152+
shell: bash
144153
run: |
145154
set -euo pipefail
146155
test_file="tests/test_$(echo '${{ matrix.skill }}' | tr '-' '_').py"
147156
echo "Running $test_file"
148157
pytest "$test_file"
149158
159+
- name: Run behavioral test for ${{ matrix.skill }} (Windows)
160+
if: matrix.os == 'Windows'
161+
working-directory: eval/behavioral
162+
shell: pwsh
163+
run: |
164+
$ErrorActionPreference = "Stop"
165+
$skill = "${{ matrix.skill }}"
166+
$test_file = "tests/test_$($skill -replace '-','_').py"
167+
Write-Host "Running $test_file"
168+
pytest $test_file
169+
150170
# Single aggregate gate. Mark THIS check required in branch protection.
151171
#
152172
# * nothing testable changed -> pass (neutral).

0 commit comments

Comments
 (0)