@@ -7,7 +7,7 @@ name: behavioral
77# * selective -- only the skills whose folder or test changed are run (the
88# whole suite runs when the shared harness changes). See
99# .github/scripts/select_behavioral.py.
10- # * label-gated execution -- the test job (which holds ANTHROPIC_API_KEY )
10+ # * label-gated execution -- the test job (which holds ORCHESTR_API_KEY)
1111# only runs on manual dispatch or when a maintainer adds the
1212# `run_behavioral` label, keeping the secret away from untrusted / fork
1313# code that runs with tool permissions bypassed.
@@ -93,24 +93,39 @@ jobs:
9393 fi
9494
9595 behavioral :
96- name : Behavioral (${{ matrix.skill }})
96+ name : Behavioral (${{ matrix.skill }} on ${{ matrix.os }} )
9797 needs : discover
9898 # Run only when something testable changed AND the run is authorized:
9999 # manual dispatch, or a maintainer added the `run_behavioral` label. This is
100- # the gate that protects the ANTHROPIC_API_KEY secret.
100+ # the gate that protects the ORCHESTR_API_KEY secret.
101101 if : >-
102102 needs.discover.outputs.any == 'true' &&
103103 (github.event_name == 'workflow_dispatch' ||
104104 contains(github.event.pull_request.labels.*.name, 'run_behavioral'))
105- runs-on : ubuntu-latest
105+ # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
106+ # the matrix below so each skill is exercised on both platforms.
107+ runs-on : [self-hosted, strix_halo, "${{ matrix.os }}"]
106108 # Behavioral runs install local models and can take a while; cap it so a
107109 # hung agent or stalled model pull fails the job instead of burning minutes.
108110 timeout-minutes : 45
109111 strategy :
110- # One skill failing should not hide the others' results.
112+ # One skill / OS failing should not hide the others' results.
111113 fail-fast : false
112114 matrix :
113115 skill : ${{ fromJson(needs.discover.outputs.skills) }}
116+ os : [Linux, Windows]
117+ env :
118+ # The CLI authenticates from this key and targets AMD's internal LLM
119+ # gateway. This job only runs on labeled PRs and manual dispatch (see this
120+ # job's `if:` above), keeping the secret away from untrusted / fork code.
121+ ANTHROPIC_API_KEY : ${{ secrets.ORCHESTR_API_KEY }}
122+ ANTHROPIC_BASE_URL : https://llm-api.amd.com
123+ # The gateway identifies the calling user via a custom header.
124+ ANTHROPIC_CUSTOM_HEADERS : " x-user: a1_ucicd"
125+ # Lets the harness default to this skill if a test relies on the env.
126+ BEHAVIORAL_SKILL : ${{ matrix.skill }}
127+ # Cost cap: sonnet only. The harness also enforces this under CI.
128+ BEHAVIORAL_MODEL : sonnet
114129 steps :
115130 - name : Check out repository
116131 uses : actions/checkout@v4
@@ -131,22 +146,27 @@ jobs:
131146 - name : Install behavioral test dependencies
132147 run : pip install -r eval/behavioral/requirements.txt
133148
134- - name : Run behavioral test for ${{ matrix.skill }}
149+ - name : Run behavioral test for ${{ matrix.skill }} (Linux)
150+ if : matrix.os == 'Linux'
135151 working-directory : eval/behavioral
136- env :
137- # The CLI authenticates from this key. This job only runs on labeled
138- # PRs and manual dispatch (see this job's `if:` above).
139- ANTHROPIC_API_KEY : ${{ secrets.ANTHROPIC_API_KEY }}
140- # Lets the harness default to this skill if a test relies on the env.
141- BEHAVIORAL_SKILL : ${{ matrix.skill }}
142- # Cost cap: sonnet only. The harness also enforces this under CI.
143- BEHAVIORAL_MODEL : sonnet
152+ shell : bash
144153 run : |
145154 set -euo pipefail
146155 test_file="tests/test_$(printf '%s' "$BEHAVIORAL_SKILL" | tr '-' '_').py" # skill comes from env (set from matrix.skill), never spliced into the script text
147156 echo "Running $test_file"
148157 pytest "$test_file"
149158
159+ - name : Run behavioral test for ${{ matrix.skill }} (Windows)
160+ if : matrix.os == 'Windows'
161+ working-directory : eval/behavioral
162+ shell : pwsh
163+ run : |
164+ $ErrorActionPreference = "Stop"
165+ $skill = $env:BEHAVIORAL_SKILL # job-level env (set from matrix.skill); avoids expanding the value inside a double-quoted string
166+ $test_file = "tests/test_$($skill -replace '-','_').py"
167+ Write-Host "Running $test_file"
168+ pytest $test_file
169+
150170 # Single aggregate gate. Mark THIS check required in branch protection.
151171 #
152172 # * nothing testable changed -> pass (neutral).
0 commit comments