Add vLLM multi-instance CPU benchmark skill #37
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: behavioral

# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models. The design:
#
#   * selective -- only the skills whose folder or test changed are run (the
#     whole suite runs when the shared harness changes). See
#     .github/scripts/select_behavioral.py.
#   * required when relevant -- when a PR changes a skill or test that maps to a
#     behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
#     that touches nothing testable passes neutrally.
#   * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check.
on:
  # Deliberately NO `paths:` filter. A workflow skipped by path filtering never
  # reports its checks, so a *required* `behavioral` check would sit in
  # "pending" forever on PRs that touch nothing under skills/ or eval/ and block
  # the merge. Instead the cheap, secret-free `discover` job runs on every PR
  # and the gate passes neutrally when nothing testable changed.
  # NOTE(review): this relies on select_behavioral.py emitting `[]` for
  # unrelated paths -- confirm against the script.
  pull_request:
    types: [opened, synchronize, reopened]
  workflow_dispatch:
    inputs:
      skills:
        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
        required: false
        default: ""
# One live run per pull request (or per ref when there is no PR, e.g. a manual
# dispatch); starting a new run cancels the one still in progress.
concurrency:
  group: behavioral-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

# The workflow token is limited to reading repository contents.
permissions:
  contents: read
jobs:
  # Decide which skills the change affects. This is secret-free (just git diff +
  # a Python mapping). Its `any` output drives whether the behavioral job runs
  # and whether the gate has anything to enforce for this PR.
  discover:
    name: Select behavioral tests
    runs-on: ubuntu-latest
    outputs:
      skills: ${{ steps.select.outputs.skills }}
      any: ${{ steps.select.outputs.any }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Full history, so the merge-base diff below can be computed.
          fetch-depth: 0

      - name: Set up uv
        uses: astral-sh/setup-uv@v7

      - name: Select skills
        id: select
        # Event data is handed to the script through environment variables
        # rather than expression interpolation: an interpolated value becomes
        # part of the shell source, so a crafted input could run arbitrary
        # commands.
        env:
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail
          if [ "${EVENT_NAME}" = "workflow_dispatch" ]; then
            if [ -n "${REQUESTED_SKILLS:-}" ]; then
              skills=$(uv run .github/scripts/select_behavioral.py --names "${REQUESTED_SKILLS}")
            else
              skills=$(uv run .github/scripts/select_behavioral.py --all)
            fi
          else
            # Three-dot range: diff from the merge base to the PR head, i.e.
            # only what the PR itself changed. A plain two-commit diff would
            # also list files that moved on the base branch after the PR
            # branched off, selecting skills the PR never touched.
            skills=$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}" \
              | uv run .github/scripts/select_behavioral.py --changed)
          fi
          echo "Selected skills: $skills"
          # The matrix below parses this value as JSON; an empty string would
          # otherwise be treated as "something to test" and then fail to parse.
          if [ -z "$skills" ]; then
            echo "select_behavioral.py produced no output; expected a JSON list." >&2
            exit 1
          fi
          echo "skills=$skills" >> "$GITHUB_OUTPUT"
          if [ "$skills" = "[]" ]; then
            echo "any=false" >> "$GITHUB_OUTPUT"
          else
            echo "any=true" >> "$GITHUB_OUTPUT"
          fi

  behavioral:
    name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
    needs: discover
    # Run whenever the change affects something testable.
    if: needs.discover.outputs.any == 'true'
    # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
    # the matrix below so each skill is exercised on both platforms.
    runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
    # Behavioral runs install local models and can take a while; cap it so a
    # hung agent or stalled model pull fails the job instead of burning minutes.
    timeout-minutes: 45
    strategy:
      # One skill / OS failing should not hide the others' results.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.discover.outputs.skills) }}
        os: [Linux, Windows]
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
      ANTHROPIC_BASE_URL: https://llm-api.amd.com/Anthropic
      ANTHROPIC_CUSTOM_HEADERS: |
        Ocp-Apim-Subscription-Key: ${{ secrets.ORCHESTR_API_KEY }}
        user: a1_ucicd
      # Lets the harness default to this skill if a test relies on the env.
      # The run steps below also read the skill name from here instead of
      # interpolating the matrix value into script text.
      BEHAVIORAL_SKILL: ${{ matrix.skill }}
      # Cost cap: sonnet only. The harness also enforces this under CI.
      BEHAVIORAL_MODEL: sonnet
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install the claude CLI
        run: npm install -g @anthropic-ai/claude-code

      - name: Install behavioral test dependencies
        run: pip install -r eval/behavioral/requirements.txt

      - name: Run behavioral test for ${{ matrix.skill }} (Linux)
        if: matrix.os == 'Linux'
        working-directory: eval/behavioral
        shell: bash
        run: |
          set -euo pipefail
          # Skill folders use dashes, test modules use underscores.
          test_file="tests/test_${BEHAVIORAL_SKILL//-/_}.py"
          echo "Running $test_file"
          pytest "$test_file"

      - name: Run behavioral test for ${{ matrix.skill }} (Windows)
        if: matrix.os == 'Windows'
        working-directory: eval/behavioral
        shell: powershell
        run: |
          $ErrorActionPreference = "Stop"
          $skill = $env:BEHAVIORAL_SKILL
          # Skill folders use dashes, test modules use underscores.
          $test_file = "tests/test_$($skill -replace '-','_').py"
          Write-Host "Running $test_file"
          pytest $test_file

  # Single aggregate gate. Mark THIS check required in branch protection.
  #
  #   * nothing testable changed -> pass (neutral).
  #   * testable change          -> pass iff the behavioral job passed.
  behavioral-gate:
    name: behavioral
    needs: [discover, behavioral]
    # Always run, so the gate reports even when an upstream job failed or was
    # skipped.
    if: always()
    runs-on: ubuntu-latest
    env:
      DISCOVER_RESULT: ${{ needs.discover.result }}
      BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
      AFFECTED: ${{ needs.discover.outputs.any }}
      SKILLS: ${{ needs.discover.outputs.skills }}
    steps:
      - name: Verify behavioral results
        run: |
          echo "discover:   $DISCOVER_RESULT"
          echo "behavioral: $BEHAVIORAL_RESULT"
          echo "affected:   $AFFECTED ($SKILLS)"
          # If discovery itself failed, surface that rather than guessing.
          if [ "$DISCOVER_RESULT" != "success" ]; then
            echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
            exit 1
          fi
          # No skill or behavioral test changed: nothing to gate on.
          if [ "$AFFECTED" != "true" ]; then
            echo "No behavioral tests affected by this change."
            exit 0
          fi
          # Something testable changed: the gate reflects the test result.
          # Anything other than "success" (failure, cancelled, skipped) fails.
          if [ "$BEHAVIORAL_RESULT" = "success" ]; then
            echo "All affected behavioral tests passed."
            exit 0
          fi
          echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
          exit 1