# Source: claude-cookbooks/.github/workflows/notebook-tests.yml (main branch, anthropics/claude-cookbooks on GitHub)

name: Notebook Tests

# Triggers:
# - pull requests that touch a notebook, the notebook test suite, or the
#   dependency manifests (pyproject.toml / uv.lock)
# - pushes to main that touch a notebook or the notebook test suite
on:
  pull_request:
    paths:
      - '**/*.ipynb'
      - 'tests/notebook_tests/**'
      - 'pyproject.toml'
      - 'uv.lock'
  push:
    branches:
      - main
    paths:
      - '**/*.ipynb'
      - 'tests/notebook_tests/**'

# Token scopes granted to the job.
permissions:
  contents: read
  # The PR-comment step posts with `gh pr comment`.
  pull-requests: write
  # Anthropic Workload Identity Federation
  id-token: write

jobs:
  # Single job: finds the notebooks changed by the triggering event, runs the
  # notebook test suite against each of them, and reports the outcome.
  test-notebooks:
    runs-on: ubuntu-latest

    steps:
      # Check out the repository with its complete history; the changed-notebook
      # detection step runs `git diff` against earlier commits.
      # NOTE(review): this action is pinned by tag, whereas setup-uv and
      # claude-code-action in this file are pinned by commit SHA — consider
      # SHA-pinning here as well for consistency.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Need full history for diff

      # Install uv with its built-in cache enabled; the cache is keyed on
      # the contents of uv.lock.
      - name: Install uv
        # v4.2.0 (sha-pinned)
        uses: astral-sh/setup-uv@38f3f104447c67c051c4a08e39b64a148898af3a
        with:
          enable-cache: true
          cache-dependency-glob: 'uv.lock'

      # Have uv provision the Python 3.11 interpreter used by the steps below.
      - name: Set up Python 3.11
        run: uv python install 3.11

      # Install the project with every optional extra. `--frozen` makes uv use
      # uv.lock as-is instead of re-resolving or updating it.
      - name: Install dependencies
        run: uv sync --frozen --all-extras

      # Work out which notebooks this run has to test.
      #
      # Writes the notebook paths (one per line) to changed_notebooks.txt and
      # sets the step outputs:
      #   has_notebooks  - "true" / "false"
      #   notebook_count - number of changed notebooks (only when has_notebooks=true)
      - name: Get changed notebooks
        id: changed-notebooks
        env:
          EVENT_NAME: ${{ github.event_name }}
          BASE_REF: ${{ github.base_ref }}
          # Commit the branch pointed at before this push (push events only).
          BEFORE_SHA: ${{ github.event.before }}
        run: |
          if [ "$EVENT_NAME" = "pull_request" ]; then
            # For PRs, get notebooks changed compared to the base branch
            git fetch origin "$BASE_REF"
            DIFF_RANGE="origin/$BASE_REF...HEAD"
          elif [ -n "$BEFORE_SHA" ] && git cat-file -e "${BEFORE_SHA}^{commit}" 2>/dev/null; then
            # For pushes, cover every commit in the push, not only the last one
            DIFF_RANGE="$BEFORE_SHA..HEAD"
          else
            # Previous tip unknown or unavailable (new branch, force push):
            # fall back to the most recent commit only
            DIFF_RANGE="HEAD~1..HEAD"
          fi

          # --diff-filter=d skips deleted notebooks, which cannot be tested.
          # core.quotepath=false keeps non-ASCII paths unescaped so they can be
          # passed on to pytest verbatim.
          CHANGED_NOTEBOOKS=$(git -c core.quotepath=false diff --name-only --diff-filter=d "$DIFF_RANGE" | grep '\.ipynb$' || true)

          if [ -z "$CHANGED_NOTEBOOKS" ]; then
            echo "No notebooks changed"
            echo "has_notebooks=false" >> "$GITHUB_OUTPUT"
            echo "" > changed_notebooks.txt
          else
            echo "Changed notebooks:"
            echo "$CHANGED_NOTEBOOKS"
            echo "$CHANGED_NOTEBOOKS" > changed_notebooks.txt
            echo "has_notebooks=true" >> "$GITHUB_OUTPUT"

            # Count notebooks
            NOTEBOOK_COUNT=$(echo "$CHANGED_NOTEBOOKS" | wc -l | tr -d ' ')
            echo "notebook_count=$NOTEBOOK_COUNT" >> "$GITHUB_OUTPUT"
          fi

      # Run the fast (`-m "not slow"`) pytest checks once per changed notebook.
      #
      # Per-notebook output is saved to test_output_<path with "/" -> "_">.txt,
      # a pass/fail line per notebook is appended to the job summary, and the
      # step outputs `passed_count`, `failed_count` and `has_failures` are set.
      # The paths of failing notebooks are written to failed_notebooks.txt
      # (the file is only created when at least one notebook failed).
      #
      # continue-on-error keeps the later reporting/upload steps running; the
      # "Final status check" step is what ultimately fails the job.
      - name: Run notebook structure tests
        id: structure-tests
        if: steps.changed-notebooks.outputs.has_notebooks == 'true'
        run: |
          # Without an explicit `shell:` key the script runs under `bash -e`
          # (no pipefail), so a pipeline reports the status of its last
          # command - here `tee`. Enable pipefail so that a failing pytest run
          # is actually counted as a failure.
          set -o pipefail

          echo "## Notebook Structure Tests" >> "$GITHUB_STEP_SUMMARY"

          PASSED_COUNT=0
          FAILED_COUNT=0
          rm -f failed_notebooks.txt

          while IFS= read -r notebook; do
            if [ -z "$notebook" ]; then
              continue
            fi

            echo "Testing: $notebook"

            # Run pytest on this specific notebook. stdin comes from /dev/null
            # so the command cannot swallow the rest of the loop's input file.
            if uv run pytest tests/notebook_tests/test_notebooks.py \
                -v --tb=short \
                -m "not slow" \
                --notebook "$notebook" \
                < /dev/null 2>&1 | tee "test_output_$(echo "$notebook" | tr '/' '_').txt"; then
              echo "✅ $notebook" >> "$GITHUB_STEP_SUMMARY"
              PASSED_COUNT=$((PASSED_COUNT + 1))
            else
              echo "❌ $notebook" >> "$GITHUB_STEP_SUMMARY"
              # printf keeps the path verbatim (no backslash interpretation)
              printf '%s\n' "$notebook" >> failed_notebooks.txt
              FAILED_COUNT=$((FAILED_COUNT + 1))
            fi
          done < changed_notebooks.txt

          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "**Results:** $PASSED_COUNT passed, $FAILED_COUNT failed" >> "$GITHUB_STEP_SUMMARY"

          # Set outputs
          echo "passed_count=$PASSED_COUNT" >> "$GITHUB_OUTPUT"
          echo "failed_count=$FAILED_COUNT" >> "$GITHUB_OUTPUT"

          if [ "$FAILED_COUNT" -gt 0 ]; then
            echo "has_failures=true" >> "$GITHUB_OUTPUT"
          else
            echo "has_failures=false" >> "$GITHUB_OUTPUT"
          fi
        continue-on-error: true

      # Merge the per-notebook pytest logs into a single file,
      # all_test_output.txt, for the PR-comment and artifact-upload steps.
      - name: Collect test results
        if: steps.changed-notebooks.outputs.has_notebooks == 'true'
        run: |
          # Concatenate every per-notebook log; just report it when none exist
          if ! cat test_output_*.txt > all_test_output.txt 2>/dev/null; then
            echo "No test output files"
          fi

      # On pull requests with failing structure tests, ask Claude to summarise
      # the failures and post them as a PR comment.
      - name: Post test results to PR
        if: |
          github.event_name == 'pull_request' &&
          steps.changed-notebooks.outputs.has_notebooks == 'true' &&
          steps.structure-tests.outputs.has_failures == 'true'
        uses: anthropics/claude-code-action@bbfaf8e1ffe3e688f7ab65ceee78de241e24a238 # v1.0.132 (>=v1.0.130 for WIF inputs)
        with:
          # Anthropic auth via Workload Identity Federation — the action
          # exchanges this job's GitHub OIDC token (id-token: write above)
          # for a short-lived access token instead of a static API key.
          anthropic_federation_rule_id: fdrl_01SqmTwzmEE547mtaYN1mqHL
          anthropic_organization_id: 1ec12c5c-6542-4da8-bf2f-c15919aef01c
          anthropic_service_account_id: svac_01BHcCBa1UWFvNrHMqJjuaUZ
          github_token: ${{ secrets.GITHUB_TOKEN }}
          # The prompt is handed to the action as plain text: only ${{ ... }}
          # expressions are expanded, shell syntax such as $(...) is not.
          # The test log is therefore referenced by file name and read by
          # Claude through the Read / cat tools allowed below.
          prompt: |
            The notebook tests found issues in the changed notebooks.

            Test results: ${{ steps.structure-tests.outputs.passed_count }} passed, ${{ steps.structure-tests.outputs.failed_count }} failed

            The combined pytest output was saved to the file `all_test_output.txt`
            in the current working directory (the repository root). Read that file
            before writing your comment; if it is very long, the first 200 lines
            are enough.

            Create a helpful PR comment that:
            - Lists which notebooks failed and why
            - Groups similar issues (e.g., "cells not executed", "execution order issues")
            - Explains how to fix common issues:
              - "Cells not executed": Run all cells from top to bottom before committing
              - "Execution order issues": Restart kernel and run all cells sequentially
              - "Deprecated models": Update to current model versions (claude-sonnet-4-6, etc.)
              - "Hardcoded API keys": Use os.environ.get("ANTHROPIC_API_KEY") instead
            - Mentions they can test locally with: `make test-notebooks NOTEBOOK=path/to/notebook.ipynb`
            - Uses friendly, constructive language

            Post using: gh pr comment ${{ github.event.pull_request.number }} --body "your comment"
          claude_args: |
            --allowedTools "Bash(gh pr comment:*),Bash(cat:*),Read"
        env:
          # Kept for compatibility; the prompt now carries the number directly.
          PR_NUMBER: ${{ github.event.pull_request.number }}

      # TODO: this step still reads the static ANTHROPIC_API_KEY secret. The
      # claude-code-action step above uses Workload Identity Federation; this
      # direct-API step needs a separate inline OIDC mint+exchange (or the
      # anthropic SDK's WIF env-var trio). Gracefully skips when the secret
      # is absent (the `[ -z "$ANTHROPIC_API_KEY" ]` guard below).
      #
      # Executes each changed notebook end to end. Runs only for pushes and
      # for pull requests whose author association is MEMBER or OWNER.
      # Per-notebook logs go to execution_outputs/, a pass/fail line per
      # notebook is appended to the job summary, and the step output
      # `exec_failures` is set to the failure count when it is non-zero.
      - name: Run notebook execution tests (maintainers only)
        id: execution-tests
        if: |
          steps.changed-notebooks.outputs.has_notebooks == 'true' &&
          (github.event_name == 'push' ||
           github.event.pull_request.author_association == 'MEMBER' ||
           github.event.pull_request.author_association == 'OWNER')
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          # Without an explicit `shell:` key the script runs under `bash -e`
          # (no pipefail), so a pipeline reports the status of its last
          # command - here `tee`. Enable pipefail so that a failing or
          # timed-out pytest run is actually counted as a failure.
          set -o pipefail

          echo "## Notebook Execution Tests" >> "$GITHUB_STEP_SUMMARY"

          # Only run if API key is available
          if [ -z "$ANTHROPIC_API_KEY" ]; then
            echo "⚠️ Skipping execution tests - no API key available" >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi

          mkdir -p execution_outputs
          EXEC_FAILED=0

          while IFS= read -r notebook; do
            if [ -z "$notebook" ]; then
              continue
            fi

            echo "Executing: $notebook"

            # Run execution test with timeout. stdin comes from /dev/null so
            # the command cannot swallow the rest of the loop's input file.
            if timeout 300 uv run pytest tests/notebook_tests/test_notebooks.py \
                -v --tb=long \
                --execute-notebooks \
                --notebook-timeout 240 \
                --notebook "$notebook" \
                -k "test_notebook_executes_successfully" \
                < /dev/null 2>&1 | tee "execution_outputs/$(echo "$notebook" | tr '/' '_').txt"; then
              echo "✅ Executed: $notebook" >> "$GITHUB_STEP_SUMMARY"
            else
              echo "❌ Failed: $notebook" >> "$GITHUB_STEP_SUMMARY"
              EXEC_FAILED=$((EXEC_FAILED + 1))
            fi
          done < changed_notebooks.txt

          if [ "$EXEC_FAILED" -gt 0 ]; then
            echo "exec_failures=$EXEC_FAILED" >> "$GITHUB_OUTPUT"
          fi
        continue-on-error: true

      # Upload the result files produced by the test steps as one artifact.
      # `always()` makes this run even when an earlier step failed; paths that
      # match nothing are ignored rather than treated as an error.
      - name: Upload test artifacts
        if: always() && steps.changed-notebooks.outputs.has_notebooks == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: notebook-test-results
          path: |
            test_output_*.txt
            all_test_output.txt
            failed_notebooks.txt
            execution_outputs/
          retention-days: 7
          if-no-files-found: ignore

      # Turn the structure-test result into the job's pass/fail status. The
      # structure-test step itself uses continue-on-error, so without this
      # step a failing notebook would not fail the job.
      - name: Final status check
        if: steps.changed-notebooks.outputs.has_notebooks == 'true'
        env:
          # Passed through the environment rather than interpolated into the
          # script text.
          HAS_FAILURES: ${{ steps.structure-tests.outputs.has_failures }}
          # Raw result of the step before continue-on-error is applied.
          STRUCTURE_OUTCOME: ${{ steps.structure-tests.outcome }}
        run: |
          if [ "$HAS_FAILURES" = "true" ]; then
            echo "❌ Some notebook tests failed. Please fix the issues above."
            exit 1
          fi

          # The test step may have aborted before it could report has_failures;
          # do not let that count as a pass.
          if [ "$STRUCTURE_OUTCOME" != "success" ]; then
            echo "❌ The notebook structure test step did not complete successfully (outcome: $STRUCTURE_OUTCOME)."
            exit 1
          fi

          echo "✅ All notebook tests passed!"

      # Nothing to test: say so in the log and in the job summary.
      - name: No notebooks changed
        if: steps.changed-notebooks.outputs.has_notebooks == 'false'
        run: |
          echo "✅ No notebooks were changed in this PR/push"
          echo "No notebooks to test" >> $GITHUB_STEP_SUMMARY