Codex Diagnose Workflow Failure #2438
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
name: Codex Diagnose Workflow Failure

# Purpose
#   Runs whenever one of the monitored workflows completes. When that run
#   failed, Codex analyzes whether the cause is a GitHub Actions configuration
#   problem, a test/code failure, or a transient infrastructure issue
#   (an external service availability problem).
#   An issue is created only for Actions configuration problems, not for
#   test failures or transient infrastructure issues.
#
# SECURITY NOTE
#   This workflow has a lower prompt-injection risk because:
#   - It triggers only on workflow_run events (not user-controlled events).
#   - It analyzes workflow logs and YAML files, not user-provided content.
#   - The Codex prompt is constructed from workflow metadata, not PR/issue
#     content.
#   - The job tokens are read-only for contents/actions and write only for
#     issues.
#
#   Malicious workflow log output could still, in theory, influence Codex's
#   analysis. That is treated as an acceptable risk on the basis that the
#   worst case is an unnecessary issue being created.
#
#   NOTE(review): the diagnose job also hands Codex GH_TOKEN from
#   secrets.WORKFLOW_PAT and runs it with approvals and sandboxing bypassed,
#   and the build job holds packages: write. The real worst case therefore
#   depends on that token's scopes - confirm the assessment above still holds.
# Fires when any of the listed workflows finishes, whatever its conclusion.
# Filtering down to failed runs happens in the check-failure job.
on:
  workflow_run:
    workflows:
      - 'PR Tests'
      - 'Codex Agent'
      - 'Codex Code Review'
      - 'Deploy to GitHub Pages'
      - 'Review Coverage Evaluator'
    types: [completed]
# Prevent duplicate diagnoses for the same workflow/branch/commit failure.
# The group key is built from the triggering run's workflow name, its head
# branch (or the literal 'no-branch' when empty) and its head SHA (or the run
# id when empty).
concurrency:
  group: "diagnose-failure-${{ github.event.workflow_run.name }}-${{ github.event.workflow_run.head_branch || 'no-branch' }}-${{ github.event.workflow_run.head_sha || github.event.workflow_run.id }}"
  # Let a diagnosis that is already running finish instead of cancelling it.
  cancel-in-progress: false
# Image reference shared by the jobs below: build-devcontainer builds/pushes
# it and diagnose-failure runs Codex inside it.
env:
  DEVCONTAINER_IMAGE: 'ghcr.io/nickborgersprobably/hide-my-list-devcontainer'
jobs:
  # Only proceed if the workflow failed.
  #
  # Gate job: sets should_diagnose to 'true' only when the triggering run
  # concluded with "failure", is not this workflow itself, and has no existing
  # issue for the same failure signature. It also re-exports the triggering
  # run's metadata as job outputs for the downstream jobs.
  check-failure:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      issues: read
    outputs:
      should_diagnose: ${{ steps.check.outputs.should_diagnose }}
      workflow_name: ${{ github.event.workflow_run.name }}
      run_id: ${{ github.event.workflow_run.id }}
      run_url: ${{ github.event.workflow_run.html_url }}
      head_branch: ${{ github.event.workflow_run.head_branch }}
      head_sha: ${{ github.event.workflow_run.head_sha }}
      conclusion: ${{ github.event.workflow_run.conclusion }}
    steps:
      # The dedupe helper script is taken from main.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: main

      - name: Check if diagnosis is needed
        id: check
        env:
          GH_TOKEN: ${{ github.token }}
          # SECURITY: event fields are handed to the script as environment
          # variables rather than being templated into the script text.
          # A branch name may contain shell metacharacters; expanding it from
          # a quoted variable keeps it data instead of code.
          CONCLUSION: ${{ github.event.workflow_run.conclusion }}
          WORKFLOW_NAME: ${{ github.event.workflow_run.name }}
          RUN_ID: ${{ github.event.workflow_run.id }}
          RUN_URL: ${{ github.event.workflow_run.html_url }}
          HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
          REPO: ${{ github.repository }}
        run: |
          echo "Workflow: $WORKFLOW_NAME"
          echo "Conclusion: $CONCLUSION"
          echo "Run ID: $RUN_ID"

          # Only diagnose failures (not success, cancelled, or skipped)
          if [ "$CONCLUSION" != "failure" ]; then
            echo "Workflow did not fail (conclusion: $CONCLUSION) - skipping diagnosis"
            echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Skip diagnosis for THIS workflow only to avoid infinite loops
          # Other Codex workflows (Codex Agent, Codex Code Review) should be diagnosed
          if [[ "$WORKFLOW_NAME" == "Codex Diagnose Workflow Failure" ]]; then
            echo "Skipping diagnosis of diagnosis workflow to prevent infinite loops"
            echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Skip duplicate diagnoses for the same workflow/branch/commit failure signature.
          # With --check-only the helper's stdout is captured; non-empty output
          # is treated as the number of an already existing issue.
          EXISTING_ISSUE=$(scripts/create-deduped-workflow-failure-issue.sh \
            --check-only \
            "$REPO" \
            "$WORKFLOW_NAME" \
            "$RUN_ID" \
            "$RUN_URL" \
            "$HEAD_BRANCH" \
            "$HEAD_SHA")
          if [ -n "$EXISTING_ISSUE" ]; then
            echo "Issue #$EXISTING_ISSUE already exists for this failure signature - skipping"
            echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "should_diagnose=true" >> "$GITHUB_OUTPUT"
| # Build devcontainer for Codex | |
| build-devcontainer: | |
| runs-on: [self-hosted, homelab] | |
| needs: check-failure | |
| if: needs.check-failure.outputs.should_diagnose == 'true' | |
| permissions: | |
| contents: read | |
| packages: write | |
| steps: | |
| # SECURITY: Checkout main branch, NOT the PR branch | |
| - name: Checkout main branch (security measure) | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: main | |
| - name: Log in to GHCR | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| username: ${{ github.actor }} | |
| password: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Build and push devcontainer | |
| uses: devcontainers/[email protected] | |
| continue-on-error: true | |
| with: | |
| imageName: ${{ env.DEVCONTAINER_IMAGE }} | |
| cacheFrom: ${{ env.DEVCONTAINER_IMAGE }} | |
| push: always | |
| # Run Codex to diagnose the failure | |
| diagnose-failure: | |
| needs: [check-failure, build-devcontainer] | |
| if: needs.check-failure.outputs.should_diagnose == 'true' | |
| runs-on: [self-hosted, homelab] | |
| permissions: | |
| contents: read | |
| issues: write | |
| actions: read | |
| packages: read | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Log in to GHCR | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| username: ${{ github.actor }} | |
| password: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Diagnose failure with Codex | |
| uses: ./.github/actions/run-devcontainer | |
| with: | |
| image: ${{ env.DEVCONTAINER_IMAGE }} | |
| pull: 'true' | |
| env: | | |
| OPENAI_API_KEY=fake-key | |
| GH_TOKEN=${{ secrets.WORKFLOW_PAT }} | |
| WORKFLOW_NAME=${{ needs.check-failure.outputs.workflow_name }} | |
| RUN_ID=${{ needs.check-failure.outputs.run_id }} | |
| RUN_URL=${{ needs.check-failure.outputs.run_url }} | |
| HEAD_BRANCH=${{ needs.check-failure.outputs.head_branch }} | |
| HEAD_SHA=${{ needs.check-failure.outputs.head_sha }} | |
| REPO=${{ github.repository }} | |
| run_cmd: | | |
| source .devcontainer/configure-codex.sh | |
| # Run Codex to diagnose the workflow failure | |
| timeout 30m codex exec \ | |
| --json \ | |
| --dangerously-bypass-approvals-and-sandbox \ | |
| "You are a GitHub Actions diagnostician. A workflow has failed and you need to determine | |
| if it's a GitHub Actions configuration problem or a normal test/code failure. | |
| WORKFLOW INFORMATION: | |
| - Workflow Name: ${WORKFLOW_NAME} | |
| - Run ID: ${RUN_ID} | |
| - Run URL: ${RUN_URL} | |
| - Branch: ${HEAD_BRANCH} | |
| - Commit SHA: ${HEAD_SHA} | |
| - Repository: ${REPO} | |
| YOUR TASK: | |
| 1. Fetch the workflow run logs using: gh run view ${RUN_ID} --log-failed | |
| 2. Read the workflow YAML file: .github/workflows/ (find the matching file for '${WORKFLOW_NAME}') | |
| 3. Analyze the failure to classify it as one of: | |
| a) TEST_FAILURE: Unit tests, integration tests, or code compilation failures | |
| b) CONFIG_FAILURE: Issues in configuration files (YAML validation, etc.) | |
| c) INFRASTRUCTURE_FAILURE: Transient external service availability issues | |
| d) ACTIONS_FAILURE: Problems with the GitHub Actions workflow definition itself | |
| INFRASTRUCTURE_FAILURE examples (do NOT create an issue for these): | |
| - GitHub Cache Service errors (502, 503, timeouts, EOF errors) | |
| - Container registry availability issues (MCR, GHCR, Docker Hub being temporarily unavailable) | |
| - Network connectivity issues to external services (EOF, connection reset, timeouts) | |
| - GitHub Actions Cache service returning 'Unicorn' error pages | |
| - Rate limiting from external services | |
| - DNS resolution failures for external registries | |
| - TLS/SSL handshake failures to external services | |
| - Any error message containing '502', '503', '504' from external services | |
| - Errors like 'failed to solve: Unavailable: error reading from server: EOF' | |
| ACTIONS_FAILURE examples (CREATE AN ISSUE for these): | |
| - Workflow YAML syntax errors | |
| - Missing or invalid action references (e.g., uses: unknown-action@v1) | |
| - Invalid workflow triggers or event configurations | |
| - Missing required secrets or environment variables (not test code) | |
| - Permission issues with GitHub tokens/actions | |
| - Docker build failures in workflow steps (not Dockerfile issues) | |
| - Job dependency issues | |
| - Concurrency/matrix configuration problems | |
| - Runner environment issues | |
| NOT ACTIONS_FAILURE (do NOT create an issue): | |
| - Script lint failures (shellcheck) | |
| - YAML validation failures (yamllint) | |
| - Documentation validation failures | |
| - Coverage below threshold | |
| - Application Dockerfile build failures | |
| - GitHub Pages build failures (Jekyll/Ruby errors) | |
| - Review coverage evaluation failures (post-merge analysis errors) | |
| DECISION LOGIC: | |
| - If the failure is ACTIONS_FAILURE: Create a GitHub issue | |
| - If the failure is TEST_FAILURE, CONFIG_FAILURE, or INFRASTRUCTURE_FAILURE: Do NOT create an issue | |
| (TEST_FAILURE and CONFIG_FAILURE are handled by the existing Codex Code Review workflow) | |
| (INFRASTRUCTURE_FAILURE issues are transient external problems that will self-resolve) | |
| IF YOU DETERMINE THIS IS AN ACTIONS_FAILURE: | |
| 1. Write the issue body below to /tmp/workflow-failure-issue.md | |
| 2. Create or reuse the canonical issue by running: | |
| scripts/create-deduped-workflow-failure-issue.sh \\ | |
| \"${REPO}\" \\ | |
| \"${WORKFLOW_NAME}\" \\ | |
| \"${RUN_ID}\" \\ | |
| \"${RUN_URL}\" \\ | |
| \"${HEAD_BRANCH}\" \\ | |
| \"${HEAD_SHA}\" \\ | |
| /tmp/workflow-failure-issue.md | |
| 3. If the script prints an existing issue number, do not create a second issue. | |
| Build /tmp/workflow-failure-issue.md with this exact structure: | |
| cat <<'ISSUE_BODY' > /tmp/workflow-failure-issue.md | |
| ## Workflow Failure Report | |
| A GitHub Actions configuration problem has been detected. | |
| ### Workflow Details | |
| - **Workflow**: ${WORKFLOW_NAME} | |
| - **Run ID**: [${RUN_ID}](${RUN_URL}) | |
| - **Branch**: ${HEAD_BRANCH} | |
| - **Commit**: ${HEAD_SHA} | |
| ### Failure Analysis | |
| [DESCRIBE THE ACTIONS CONFIGURATION PROBLEM HERE] | |
| ### Relevant Logs | |
| \\\`\\\`\\\` | |
| [PASTE KEY ERROR LOGS HERE] | |
| \\\`\\\`\\\` | |
| ### Suggested Fix | |
| [DESCRIBE HOW TO FIX THE ACTIONS CONFIGURATION] | |
| ### Files to Review | |
| - [LIST WORKFLOW YAML FILES THAT NEED CHANGES] | |
| --- | |
| Generated by Codex Workflow Diagnostician | |
| ISSUE_BODY | |
| IF THIS IS NOT AN ACTIONS_FAILURE, output: | |
| 'DIAGNOSIS: [TEST_FAILURE|CONFIG_FAILURE|INFRASTRUCTURE_FAILURE] - No issue created.' | |
| For TEST_FAILURE/CONFIG_FAILURE: 'This failure type is handled by the Codex Code Review workflow.' | |
| For INFRASTRUCTURE_FAILURE: 'This is a transient external service issue that will self-resolve. No action needed.' | |
| Be thorough in your analysis. Examine the actual error messages in the logs." < /dev/null 2>&1 | tee /tmp/diagnose-output.jsonl | |
| - name: Upload diagnosis output | |
| uses: actions/upload-artifact@v4 | |
| if: always() | |
| continue-on-error: true | |
| with: | |
| name: workflow-diagnosis-output | |
| path: /tmp/diagnose-output.jsonl | |
| retention-days: 7 | |
| - name: Notify agent webhook | |
| if: always() | |
| continue-on-error: true | |
| run: | | |
| # Signal the OpenClaw agent that a workflow failure was diagnosed | |
| WEBHOOK_URL="${{ vars.AGENT_WEBHOOK_URL }}" | |
| if [ -n "$WEBHOOK_URL" ]; then | |
| curl -s --max-time 5 "$WEBHOOK_URL" >/dev/null 2>&1 || true | |
| fi |