skills/.github/workflows/behavioral.yml at main · amd/skills · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
name: behavioral

# Behavioral tests run a real agent against a skill and grade what it did (see
# eval/behavioral/). They cost real API tokens and, for some skills, install
# and exercise local models. The design:
#
#   * selective -- only the skills whose folder or test changed are run (the
#     whole suite runs when the shared harness changes). See
#     .github/scripts/select_behavioral.py.
#   * required when relevant -- when a PR changes a skill or test that maps to a
#     behavioral test, the `behavioral` gate FAILS until the tests pass. A PR
#     that touches nothing testable passes neutrally.
#   * dispatchable -- run any subset by hand from the Actions tab.
#
# Shape mirrors validate.yml: discover -> matrix -> single aggregate gate, so
# branch protection can require just the `behavioral` check.

# Triggers.
#
# pull_request intentionally has NO `paths:` filter. The `behavioral` gate is
# meant to be a required check, and a workflow that is skipped by path
# filtering never reports a status: the required check would stay "Pending"
# and block the merge. Instead the workflow always starts, the cheap,
# secret-free `discover` job decides whether anything testable changed, and
# the gate passes neutrally when nothing did.
# NOTE(review): this relies on select_behavioral.py returning `[]` for changed
# paths it does not map to a test -- confirm against the script.
on:
  pull_request:
    types: [opened, synchronize, reopened]
  # Manual runs from the Actions tab; `skills` narrows the run to a subset.
  workflow_dispatch:
    inputs:
      skills:
        description: "Comma-separated skill names to test (blank = every skill that has a behavioral test)."
        required: false
        default: ""

# One live run per pull request (or per ref for manual dispatches): a newer
# run in the same group cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: 'behavioral-${{ github.event.pull_request.number || github.ref }}'

# Default token scope for every job: read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Decide which skills the change affects. This is secret-free (just git diff +
  # the Python mapping in .github/scripts/select_behavioral.py). Outputs:
  #   skills -- the selection printed by the script (consumed via fromJson as
  #             the behavioral matrix).
  #   any    -- 'false' when the selection is exactly `[]`, otherwise 'true';
  #             drives whether the behavioral job runs and whether the gate
  #             has anything to enforce for this PR.
  discover:
    name: Select behavioral tests
    runs-on: ubuntu-latest
    outputs:
      skills: ${{ steps.select.outputs.skills }}
      any: ${{ steps.select.outputs.any }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          # Full history, so the merge base of the PR's base and head commits
          # is available to `git diff` below.
          fetch-depth: 0

      - name: Set up uv
        uses: astral-sh/setup-uv@v7

      - name: Select skills
        id: select
        # Event data reaches the script through environment variables, never
        # by splicing `${{ ... }}` into the script text: an expression is
        # expanded before the shell parses the script, so a crafted value
        # (e.g. a dispatch input containing `"; cmd; "`) would run as code.
        env:
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_SKILLS: ${{ github.event.inputs.skills }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          set -euo pipefail
          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            # Manual run: an explicit subset if one was given, else everything.
            if [ -n "$REQUESTED_SKILLS" ]; then
              skills=$(uv run .github/scripts/select_behavioral.py --names "$REQUESTED_SKILLS")
            else
              skills=$(uv run .github/scripts/select_behavioral.py --all)
            fi
          else
            # Three-dot diff = changes on the PR side since the merge base.
            # A plain two-commit diff would also list files that changed on
            # the base branch after the PR forked, selecting (and paying for)
            # tests the PR never touched.
            skills=$(git diff --name-only "$BASE_SHA...$HEAD_SHA" \
              | uv run .github/scripts/select_behavioral.py --changed)
          fi
          echo "Selected skills: $skills"
          echo "skills=$skills" >> "$GITHUB_OUTPUT"
          if [ "$skills" = "[]" ]; then
            echo "any=false" >> "$GITHUB_OUTPUT"
          else
            echo "any=true" >> "$GITHUB_OUTPUT"
          fi

  # Runs one behavioral test per (skill, OS) pair selected by `discover`.
  behavioral:
    name: Behavioral (${{ matrix.skill }} on ${{ matrix.os }})
    needs: discover
    # Run whenever the change affects something testable.
    if: needs.discover.outputs.any == 'true'
    # Self-hosted Strix Halo runners. The OS label (Linux / Windows) comes from
    # the matrix below so each skill is exercised on both platforms.
    runs-on: [self-hosted, strix_halo, "${{ matrix.os }}"]
    # Behavioral runs install local models and can take a while; cap it so a
    # hung agent or stalled model pull fails the job instead of burning minutes.
    timeout-minutes: 45
    strategy:
      # One skill / OS failing should not hide the others' results.
      fail-fast: false
      matrix:
        skill: ${{ fromJson(needs.discover.outputs.skills) }}
        os: [Linux, Windows]
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ORCHESTR_API_KEY }}
      ANTHROPIC_BASE_URL: https://llm-api.amd.com/Anthropic
      ANTHROPIC_CUSTOM_HEADERS: |
        Ocp-Apim-Subscription-Key: ${{ secrets.ORCHESTR_API_KEY }}
        user: a1_ucicd
      # Lets the harness default to this skill if a test relies on the env.
      BEHAVIORAL_SKILL: ${{ matrix.skill }}
      # Cost cap: sonnet only. The harness also enforces this under CI.
      BEHAVIORAL_MODEL: sonnet
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install the claude CLI
        run: npm install -g @anthropic-ai/claude-code

      - name: Install behavioral test dependencies
        run: pip install -r eval/behavioral/requirements.txt

      # The two run steps below differ only in shell. In both, the skill name
      # is handed to the script through the SKILL environment variable rather
      # than expanded into the script text: the name derives from files in the
      # pull request, and this job runs with secrets on self-hosted hardware,
      # so it must never be parsed as shell/PowerShell code.
      - name: Run behavioral test for ${{ matrix.skill }} (Linux)
        if: matrix.os == 'Linux'
        working-directory: eval/behavioral
        shell: bash
        env:
          SKILL: ${{ matrix.skill }}
        run: |
          set -euo pipefail
          # Test modules use underscores where skill names use hyphens.
          test_file="tests/test_${SKILL//-/_}.py"
          echo "Running $test_file"
          pytest "$test_file"

      - name: Run behavioral test for ${{ matrix.skill }} (Windows)
        if: matrix.os == 'Windows'
        working-directory: eval/behavioral
        shell: powershell
        env:
          SKILL: ${{ matrix.skill }}
        run: |
          $ErrorActionPreference = "Stop"
          $skill = $env:SKILL
          # Test modules use underscores where skill names use hyphens.
          $test_file = "tests/test_$($skill -replace '-','_').py"
          Write-Host "Running $test_file"
          pytest $test_file

  # Aggregate gate: the one check to mark as required in branch protection.
  # It always runs (`if: always()`) and folds the upstream results into a
  # single verdict:
  #
  #   * discover did not succeed  -> fail.
  #   * nothing testable changed  -> pass (neutral).
  #   * testable change           -> pass iff the behavioral job succeeded.
  behavioral-gate:
    name: behavioral
    needs: [discover, behavioral]
    if: always()
    runs-on: ubuntu-latest
    env:
      DISCOVER_RESULT: ${{ needs.discover.result }}
      BEHAVIORAL_RESULT: ${{ needs.behavioral.result }}
      AFFECTED: ${{ needs.discover.outputs.any }}
      SKILLS: ${{ needs.discover.outputs.skills }}
    steps:
      - name: Verify behavioral results
        run: |
          # Log the raw inputs first so a failing gate is easy to diagnose.
          echo "discover:    $DISCOVER_RESULT"
          echo "behavioral:  $BEHAVIORAL_RESULT"
          echo "affected:    $AFFECTED ($SKILLS)"

          # Guard: without a successful discovery there is nothing to trust.
          [[ "$DISCOVER_RESULT" == "success" ]] || {
            echo "The discover job did not succeed ($DISCOVER_RESULT)." >&2
            exit 1
          }

          # Neither a skill nor a behavioral test changed: neutral pass.
          if [[ "$AFFECTED" != "true" ]]; then
            echo "No behavioral tests affected by this change."
            exit 0
          fi

          # Testable change: the verdict mirrors the behavioral job's result.
          case "$BEHAVIORAL_RESULT" in
            success)
              echo "All affected behavioral tests passed."
              exit 0
              ;;
            *)
              echo "One or more behavioral tests failed ($BEHAVIORAL_RESULT)." >&2
              exit 1
              ;;
          esac