Claude Diagnose Workflow Failure #595
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Claude Diagnose Workflow Failure

# Runs whenever one of the workflows listed under `on.workflow_run` completes.
# The check-failure job then discards every run whose conclusion is not `failure`.
# Analyzes if it's a GitHub Actions configuration problem vs a test/code failure
# or transient infrastructure issue (external service availability problems).
# Creates an issue only for Actions configuration problems
# (not test failures or transient infrastructure issues).
#
# SECURITY NOTE: Prompt injection risk is reduced, but not eliminated, because:
# - It only triggers on workflow_run events (not directly on PR/issue events)
# - It analyzes workflow logs and YAML files, not PR/issue bodies
# - The Claude prompt is constructed from workflow_run metadata, not PR/issue content.
#   Note that the head branch name is part of that metadata and, for pull
#   requests, is chosen by the PR author.
# - The job-level `permissions` blocks restrict GITHUB_TOKEN: contents/actions/packages
#   read plus issues write in diagnose-failure, and contents read plus packages
#   write in build-devcontainer.
#
# Caveats:
# - Malicious workflow log output could influence Claude's analysis.
# - diagnose-failure runs Claude with --dangerously-skip-permissions and passes
#   secrets.WORKFLOW_PAT as GH_TOKEN. `permissions` only governs GITHUB_TOKEN, so
#   the effective blast radius is whatever that PAT is scoped to.
#   TODO(review): confirm the PAT is minimally scoped; if it is not, the worst
#   case is more than "an unnecessary issue gets created".
# Fire after any of the watched workflows finishes, whatever its conclusion.
# Filtering down to real failures happens in the check-failure job.
on:
  workflow_run:
    types: [completed]
    workflows:
      - 'PR Tests'
      - 'Claude Code'
      - 'Claude Code Review'
      - 'Deploy to GitHub Pages'
      - 'Review Coverage Evaluator'
# One concurrency group per triggering run ID, so two diagnoses of the same
# workflow run never execute at the same time. An in-progress diagnosis is
# never cancelled by a newer one.
concurrency:
  cancel-in-progress: false
  group: diagnose-failure-${{ github.event.workflow_run.id }}
# Devcontainer image reference shared by build-devcontainer (which pushes it)
# and diagnose-failure (which uses it as image name and cache source).
env:
  DEVCONTAINER_IMAGE: 'ghcr.io/nickborgersprobably/hide-my-list-devcontainer'
| jobs: | |
# Gate job: only proceed if the triggering workflow failed, is not this
# workflow itself, and has no open issue yet. Also republishes the triggering
# run's metadata as job outputs for the downstream jobs.
check-failure:
  runs-on: ubuntu-latest
  # The only API call made here is `gh issue list`, so read access to issues
  # is all the GITHUB_TOKEN needs.
  permissions:
    issues: read
  outputs:
    should_diagnose: ${{ steps.check.outputs.should_diagnose }}
    workflow_name: ${{ github.event.workflow_run.name }}
    run_id: ${{ github.event.workflow_run.id }}
    run_url: ${{ github.event.workflow_run.html_url }}
    head_branch: ${{ github.event.workflow_run.head_branch }}
    head_sha: ${{ github.event.workflow_run.head_sha }}
    conclusion: ${{ github.event.workflow_run.conclusion }}
  steps:
    - name: Check if diagnosis is needed
      id: check
      # Event data is handed to the script through environment variables
      # instead of being expanded into the script text, so a value containing
      # quotes or `$(...)` cannot alter the shell code that runs.
      env:
        GH_TOKEN: ${{ github.token }}
        CONCLUSION: ${{ github.event.workflow_run.conclusion }}
        WORKFLOW_NAME: ${{ github.event.workflow_run.name }}
        RUN_ID: ${{ github.event.workflow_run.id }}
      run: |
        echo "Workflow: $WORKFLOW_NAME"
        echo "Conclusion: $CONCLUSION"
        echo "Run ID: $RUN_ID"

        # Only diagnose failures (not success, cancelled, or skipped)
        if [ "$CONCLUSION" != "failure" ]; then
          echo "Workflow did not fail (conclusion: $CONCLUSION) - skipping diagnosis"
          echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Skip diagnosis for THIS workflow only to avoid infinite loops
        # Other Claude workflows (Claude Code, Claude Code Review) should be diagnosed
        if [[ "$WORKFLOW_NAME" == "Claude Diagnose Workflow Failure" ]]; then
          echo "Skipping diagnosis of diagnosis workflow to prevent infinite loops"
          echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Check if an issue already exists for this workflow run.
        # GITHUB_REPOSITORY is the runner-provided owner/name of this repo.
        EXISTING_ISSUE=$(gh issue list \
          --repo "$GITHUB_REPOSITORY" \
          --search "Workflow Failure: $WORKFLOW_NAME run #$RUN_ID" \
          --state open \
          --json number \
          --jq '.[0].number // empty')

        if [ -n "$EXISTING_ISSUE" ]; then
          echo "Issue #$EXISTING_ISSUE already exists for this run - skipping"
          echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        echo "should_diagnose=true" >> "$GITHUB_OUTPUT"
# Rebuild and publish the devcontainer image that the diagnose job runs in.
# Skipped entirely unless check-failure asked for a diagnosis.
build-devcontainer:
  needs: check-failure
  if: needs.check-failure.outputs.should_diagnose == 'true'
  runs-on:
    - self-hosted
    - homelab
  permissions:
    packages: write
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Log in to GHCR
      uses: docker/login-action@v3
      with:
        password: ${{ secrets.GITHUB_TOKEN }}
        username: ${{ github.actor }}
        registry: ghcr.io

    # Best effort: continue-on-error keeps this job green even when the
    # build or push fails.
    - name: Build and push devcontainer
      continue-on-error: true
      uses: devcontainers/ci@v0.3
      with:
        push: always
        cacheFrom: ${{ env.DEVCONTAINER_IMAGE }}
        imageName: ${{ env.DEVCONTAINER_IMAGE }}
# Run Claude inside the devcontainer to diagnose the failure, then publish the
# raw output as an artifact and ping the agent webhook.
diagnose-failure:
  needs: [check-failure, build-devcontainer]
  if: needs.check-failure.outputs.should_diagnose == 'true'
  runs-on: [self-hosted, homelab]
  # These scopes apply to GITHUB_TOKEN only. The Claude step below is given
  # secrets.WORKFLOW_PAT as GH_TOKEN, which is not limited by this block.
  permissions:
    contents: read
    issues: write
    actions: read
    packages: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0

    - name: Log in to GHCR
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Diagnose failure with Claude
      uses: devcontainers/ci@v0.3
      with:
        imageName: ${{ env.DEVCONTAINER_IMAGE }}
        cacheFrom: ${{ env.DEVCONTAINER_IMAGE }}
        push: never
        # Variables made available inside the container, one NAME=value per
        # line. The prompt below refers to them as shell variables, so their
        # values are expanded by the shell as data, never re-parsed as code.
        env: |
          CLAUDE_CODE_OAUTH_TOKEN=${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          GH_TOKEN=${{ secrets.WORKFLOW_PAT }}
          WORKFLOW_NAME=${{ needs.check-failure.outputs.workflow_name }}
          RUN_ID=${{ needs.check-failure.outputs.run_id }}
          RUN_URL=${{ needs.check-failure.outputs.run_url }}
          HEAD_BRANCH=${{ needs.check-failure.outputs.head_branch }}
          HEAD_SHA=${{ needs.check-failure.outputs.head_sha }}
          REPO=${{ github.repository }}
        # The output is tee'd to a relative path so that it is written under
        # the command's working directory rather than the container's own /tmp.
        # NOTE(review): this assumes runCmd starts in the checked-out
        # workspace, which devcontainers/ci bind-mounts from the runner, so the
        # upload step (which runs on the runner) can see the file - confirm.
        runCmd: |
          # Run Claude to diagnose the workflow failure
          # Using Sonnet: Classification task with clear decision tree (4 categories)
          claude --print \
          --verbose \
          --output-format stream-json \
          --model sonnet \
          --dangerously-skip-permissions \
          --max-turns 100 \
          "You are a GitHub Actions diagnostician. A workflow has failed and you need to determine
          if it's a GitHub Actions configuration problem or a normal test/code failure.
          WORKFLOW INFORMATION:
          - Workflow Name: ${WORKFLOW_NAME}
          - Run ID: ${RUN_ID}
          - Run URL: ${RUN_URL}
          - Branch: ${HEAD_BRANCH}
          - Commit SHA: ${HEAD_SHA}
          - Repository: ${REPO}
          YOUR TASK:
          1. Fetch the workflow run logs using: gh run view ${RUN_ID} --log-failed
          2. Read the workflow YAML file: .github/workflows/ (find the matching file for '${WORKFLOW_NAME}')
          3. Analyze the failure to classify it as one of:
          a) TEST_FAILURE: Unit tests, integration tests, or code compilation failures
          b) CONFIG_FAILURE: Issues in configuration files (YAML validation, etc.)
          c) INFRASTRUCTURE_FAILURE: Transient external service availability issues
          d) ACTIONS_FAILURE: Problems with the GitHub Actions workflow definition itself
          INFRASTRUCTURE_FAILURE examples (do NOT create an issue for these):
          - GitHub Cache Service errors (502, 503, timeouts, EOF errors)
          - Container registry availability issues (MCR, GHCR, Docker Hub being temporarily unavailable)
          - Network connectivity issues to external services (EOF, connection reset, timeouts)
          - GitHub Actions Cache service returning 'Unicorn' error pages
          - Rate limiting from external services
          - DNS resolution failures for external registries
          - TLS/SSL handshake failures to external services
          - Any error message containing '502', '503', '504' from external services
          - Errors like 'failed to solve: Unavailable: error reading from server: EOF'
          ACTIONS_FAILURE examples (CREATE AN ISSUE for these):
          - Workflow YAML syntax errors
          - Missing or invalid action references (e.g., uses: unknown-action@v1)
          - Invalid workflow triggers or event configurations
          - Missing required secrets or environment variables (not test code)
          - Permission issues with GitHub tokens/actions
          - Docker build failures in workflow steps (not Dockerfile issues)
          - Job dependency issues
          - Concurrency/matrix configuration problems
          - Runner environment issues
          NOT ACTIONS_FAILURE (do NOT create an issue):
          - Script lint failures (shellcheck)
          - YAML validation failures (yamllint)
          - Documentation validation failures
          - Coverage below threshold
          - Application Dockerfile build failures
          - GitHub Pages build failures (Jekyll/Ruby errors)
          - Review coverage evaluation failures (post-merge analysis errors)
          DECISION LOGIC:
          - If the failure is ACTIONS_FAILURE: Create a GitHub issue
          - If the failure is TEST_FAILURE, CONFIG_FAILURE, or INFRASTRUCTURE_FAILURE: Do NOT create an issue
          (TEST_FAILURE and CONFIG_FAILURE are handled by the existing Claude Code Review workflow)
          (INFRASTRUCTURE_FAILURE issues are transient external problems that will self-resolve)
          IF YOU DETERMINE THIS IS AN ACTIONS_FAILURE, create an issue:
          # First ensure the github-actions label exists
          gh label create \"github-actions\" --color \"0366d6\" --description \"GitHub Actions workflow issues\" 2>/dev/null || true
          gh issue create \\
          --title \"Workflow Failure: ${WORKFLOW_NAME} run #${RUN_ID} - Actions Configuration Issue\" \\
          --assignee NickBorgers \\
          --label \"bug,github-actions\" \\
          --body \"\$(cat <<'ISSUE_BODY'
          ## Workflow Failure Report
          A GitHub Actions configuration problem has been detected.
          ### Workflow Details
          - **Workflow**: ${WORKFLOW_NAME}
          - **Run ID**: [${RUN_ID}](${RUN_URL})
          - **Branch**: ${HEAD_BRANCH}
          - **Commit**: ${HEAD_SHA}
          ### Failure Analysis
          [DESCRIBE THE ACTIONS CONFIGURATION PROBLEM HERE]
          ### Relevant Logs
          \\\`\\\`\\\`
          [PASTE KEY ERROR LOGS HERE]
          \\\`\\\`\\\`
          ### Suggested Fix
          [DESCRIBE HOW TO FIX THE ACTIONS CONFIGURATION]
          ### Files to Review
          - [LIST WORKFLOW YAML FILES THAT NEED CHANGES]
          ---
          Generated by Claude Workflow Diagnostician
          ISSUE_BODY
          )\"
          IF THIS IS NOT AN ACTIONS_FAILURE, output:
          'DIAGNOSIS: [TEST_FAILURE|CONFIG_FAILURE|INFRASTRUCTURE_FAILURE] - No issue created.'
          For TEST_FAILURE/CONFIG_FAILURE: 'This failure type is handled by the Claude Code Review workflow.'
          For INFRASTRUCTURE_FAILURE: 'This is a transient external service issue that will self-resolve. No action needed.'
          Be thorough in your analysis. Examine the actual error messages in the logs." < /dev/null 2>&1 | tee diagnose-output.jsonl

    # Runs on the runner, not in the container. The relative path resolves
    # against the runner's workspace directory, i.e. the checkout above.
    - name: Upload diagnosis output
      uses: actions/upload-artifact@v4
      if: always()
      continue-on-error: true
      with:
        name: workflow-diagnosis-output
        path: diagnose-output.jsonl
        retention-days: 7

    - name: Notify agent webhook
      if: always()
      continue-on-error: true
      # The URL is passed through the environment rather than expanded into
      # the script text, so its contents cannot be interpreted as shell code.
      env:
        WEBHOOK_URL: ${{ vars.AGENT_WEBHOOK_URL }}
      run: |
        # Signal the OpenClaw agent that a workflow failure was diagnosed
        if [ -n "$WEBHOOK_URL" ]; then
          curl -s --max-time 5 "$WEBHOOK_URL" >/dev/null 2>&1 || true
        fi