# .github/workflows/codex-diagnose-workflow-failure.yml (NickBorgersProbably/hide-my-list, main branch)
---
name: Codex Diagnose Workflow Failure

# Triggers when a run of any workflow listed under `on.workflow_run.workflows`
# completes; diagnosis proceeds only when that run's conclusion is `failure`.
# Analyzes whether it's a GitHub Actions configuration problem vs. a test/code failure
# or a transient infrastructure issue (external service availability problems).
# Creates an issue only for Actions configuration problems
# (not test failures or transient infrastructure issues).

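# SECURITY NOTE: This workflow has lower prompt injection risk because:
# - It only triggers on workflow_run events (not directly on user-controlled events)
# - It analyzes workflow logs and YAML files, not PR/issue text
# - The Codex prompt is constructed from workflow metadata, not PR/issue content
# - GITHUB_TOKEN permissions are limited per job: diagnose-failure gets read for
#   contents/actions/packages and write for issues; build-devcontainer additionally
#   gets packages: write
#
# However, workflow metadata (e.g. the branch name) and workflow log output can contain
# attacker-influenced text that could theoretically influence Codex's analysis.
# Codex runs with --dangerously-bypass-approvals-and-sandbox and a GH_TOKEN taken from
# secrets.WORKFLOW_PAT, so the impact of a successful injection is bounded by that PAT's
# scopes, not only by the permissions listed above. This is an acceptable risk only while
# the PAT is scoped so that the worst case is creating an unnecessary issue.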

# Fire whenever a run of one of the workflows named below finishes,
# whatever its conclusion; the check-failure job filters for failures.
on:
  workflow_run:
    types:
      - completed
    workflows:
      - 'PR Tests'
      - 'Codex Agent'
      - 'Codex Code Review'
      - 'Deploy to GitHub Pages'
      - 'Review Coverage Evaluator'

# One concurrency group per workflow/branch/commit signature, so diagnoses of
# the same failure never run in parallel. An in-progress diagnosis is left to
# finish rather than being cancelled by a newer run in the same group.
concurrency:
  cancel-in-progress: false
  group: diagnose-failure-${{ github.event.workflow_run.name }}-${{ github.event.workflow_run.head_branch || 'no-branch' }}-${{ github.event.workflow_run.head_sha || github.event.workflow_run.id }}

# Image name shared by the build job (push) and the diagnosis job (pull)
env:
  DEVCONTAINER_IMAGE: ghcr.io/nickborgersprobably/hide-my-list-devcontainer

jobs:
  # Gate job: decides whether the triggering run should be diagnosed.
  # `should_diagnose` is 'true' only when the run concluded with `failure`,
  # is not a run of this workflow itself, and the dedupe script reports no
  # existing issue for the same workflow/branch/commit signature.
  # The remaining outputs re-export the triggering run's metadata so that
  # downstream jobs can read it from `needs.check-failure.outputs`.
  check-failure:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      issues: read
    outputs:
      should_diagnose: ${{ steps.check.outputs.should_diagnose }}
      workflow_name: ${{ github.event.workflow_run.name }}
      run_id: ${{ github.event.workflow_run.id }}
      run_url: ${{ github.event.workflow_run.html_url }}
      head_branch: ${{ github.event.workflow_run.head_branch }}
      head_sha: ${{ github.event.workflow_run.head_sha }}
      conclusion: ${{ github.event.workflow_run.conclusion }}
    steps:
      # Check out main: the step below executes a script from this checkout
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: main

      - name: Check if diagnosis is needed
        id: check
        # SECURITY: event data is handed to the script through environment
        # variables instead of being expanded into the script text. Values
        # such as head_branch are chosen by whoever pushed the branch, and
        # inline expression expansion would let shell metacharacters in them
        # run as commands. The variable names match those the diagnose-failure
        # job already sets when the same dedupe script runs there.
        env:
          GH_TOKEN: ${{ github.token }}
          CONCLUSION: ${{ github.event.workflow_run.conclusion }}
          WORKFLOW_NAME: ${{ github.event.workflow_run.name }}
          RUN_ID: ${{ github.event.workflow_run.id }}
          RUN_URL: ${{ github.event.workflow_run.html_url }}
          HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
          REPO: ${{ github.repository }}
        run: |
          echo "Workflow: $WORKFLOW_NAME"
          echo "Conclusion: $CONCLUSION"
          echo "Run ID: $RUN_ID"

          # Only diagnose failures (not success, cancelled, or skipped)
          if [ "$CONCLUSION" != "failure" ]; then
            echo "Workflow did not fail (conclusion: $CONCLUSION) - skipping diagnosis"
            echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Skip diagnosis for THIS workflow only to avoid infinite loops
          # Other Codex workflows (Codex Agent, Codex Code Review) should be diagnosed
          if [[ "$WORKFLOW_NAME" == "Codex Diagnose Workflow Failure" ]]; then
            echo "Skipping diagnosis of diagnosis workflow to prevent infinite loops"
            echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Skip duplicate diagnoses for the same workflow/branch/commit failure signature.
          # With --check-only the script's stdout is treated as the number of an
          # existing issue; empty output means no matching issue was found.
          EXISTING_ISSUE=$(scripts/create-deduped-workflow-failure-issue.sh \
            --check-only \
            "$REPO" \
            "$WORKFLOW_NAME" \
            "$RUN_ID" \
            "$RUN_URL" \
            "$HEAD_BRANCH" \
            "$HEAD_SHA")

          if [ -n "$EXISTING_ISSUE" ]; then
            echo "Issue #$EXISTING_ISSUE already exists for this failure signature - skipping"
            echo "should_diagnose=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "should_diagnose=true" >> "$GITHUB_OUTPUT"

  # Build and publish the devcontainer image that the diagnosis job pulls
  build-devcontainer:
    needs: check-failure
    if: needs.check-failure.outputs.should_diagnose == 'true'
    runs-on:
      - self-hosted
      - homelab
    permissions:
      packages: write
      contents: read

    steps:
      # SECURITY: the image is built from main, never from the PR branch
      - name: Checkout main branch (security measure)
        uses: actions/checkout@v4
        with:
          ref: main

      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          password: ${{ secrets.GITHUB_TOKEN }}
          username: ${{ github.actor }}

      # A failed build or push does not fail the job (continue-on-error);
      # presumably so diagnosis can still run against a previously pushed
      # image - confirm.
      - name: Build and push devcontainer
        continue-on-error: true
        uses: devcontainers/ci@v0.3
        with:
          push: always
          imageName: ${{ env.DEVCONTAINER_IMAGE }}
          cacheFrom: ${{ env.DEVCONTAINER_IMAGE }}

  # Run Codex inside the devcontainer to classify the failed run and, for
  # Actions configuration problems only, file a deduplicated GitHub issue.
  # Runs only when check-failure set should_diagnose to 'true' and the
  # build-devcontainer job completed successfully.
  diagnose-failure:
    needs: [check-failure, build-devcontainer]
    if: needs.check-failure.outputs.should_diagnose == 'true'
    runs-on: [self-hosted, homelab]
    permissions:
      contents: read
      issues: write
      actions: read
      packages: read

    steps:
      # SECURITY: pin the checkout to main explicitly, matching the other two
      # jobs, so Codex and the scripts it invokes come from main. Full history
      # is fetched (fetch-depth: 0).
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0

      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # The failed run's metadata is passed into the container as environment
      # variables; the shell expands them inside the double-quoted prompt
      # before Codex receives it.
      # Codex output (stdout and stderr) is teed to /tmp/diagnose-output.jsonl.
      # NOTE(review): OPENAI_API_KEY is a placeholder value; presumably
      # .devcontainer/configure-codex.sh sets up the real credentials - confirm.
      # NOTE(review): the command ends in `| tee`, so unless the shell that
      # run-devcontainer uses enables pipefail, a Codex failure or timeout
      # will not fail this step - confirm that is intended.
      - name: Diagnose failure with Codex
        uses: ./.github/actions/run-devcontainer
        with:
          image: ${{ env.DEVCONTAINER_IMAGE }}
          pull: 'true'
          env: |
            OPENAI_API_KEY=fake-key
            GH_TOKEN=${{ secrets.WORKFLOW_PAT }}
            WORKFLOW_NAME=${{ needs.check-failure.outputs.workflow_name }}
            RUN_ID=${{ needs.check-failure.outputs.run_id }}
            RUN_URL=${{ needs.check-failure.outputs.run_url }}
            HEAD_BRANCH=${{ needs.check-failure.outputs.head_branch }}
            HEAD_SHA=${{ needs.check-failure.outputs.head_sha }}
            REPO=${{ github.repository }}
          run_cmd: |
            source .devcontainer/configure-codex.sh

            # Run Codex to diagnose the workflow failure
            timeout 30m codex exec \
              --json \
              --dangerously-bypass-approvals-and-sandbox \
              "You are a GitHub Actions diagnostician. A workflow has failed and you need to determine
            if it's a GitHub Actions configuration problem or a normal test/code failure.

            WORKFLOW INFORMATION:
            - Workflow Name: ${WORKFLOW_NAME}
            - Run ID: ${RUN_ID}
            - Run URL: ${RUN_URL}
            - Branch: ${HEAD_BRANCH}
            - Commit SHA: ${HEAD_SHA}
            - Repository: ${REPO}

            YOUR TASK:
            1. Fetch the workflow run logs using: gh run view ${RUN_ID} --log-failed
            2. Read the workflow YAML file: .github/workflows/ (find the matching file for '${WORKFLOW_NAME}')
            3. Analyze the failure to classify it as one of:
               a) TEST_FAILURE: Unit tests, integration tests, or code compilation failures
               b) CONFIG_FAILURE: Issues in configuration files (YAML validation, etc.)
               c) INFRASTRUCTURE_FAILURE: Transient external service availability issues
               d) ACTIONS_FAILURE: Problems with the GitHub Actions workflow definition itself

            INFRASTRUCTURE_FAILURE examples (do NOT create an issue for these):
            - GitHub Cache Service errors (502, 503, timeouts, EOF errors)
            - Container registry availability issues (MCR, GHCR, Docker Hub being temporarily unavailable)
            - Network connectivity issues to external services (EOF, connection reset, timeouts)
            - GitHub Actions Cache service returning 'Unicorn' error pages
            - Rate limiting from external services
            - DNS resolution failures for external registries
            - TLS/SSL handshake failures to external services
            - Any error message containing '502', '503', '504' from external services
            - Errors like 'failed to solve: Unavailable: error reading from server: EOF'

            ACTIONS_FAILURE examples (CREATE AN ISSUE for these):
            - Workflow YAML syntax errors
            - Missing or invalid action references (e.g., uses: unknown-action@v1)
            - Invalid workflow triggers or event configurations
            - Missing required secrets or environment variables (not test code)
            - Permission issues with GitHub tokens/actions
            - Docker build failures in workflow steps (not Dockerfile issues)
            - Job dependency issues
            - Concurrency/matrix configuration problems
            - Runner environment issues

            NOT ACTIONS_FAILURE (do NOT create an issue):
            - Script lint failures (shellcheck)
            - YAML validation failures (yamllint)
            - Documentation validation failures
            - Coverage below threshold
            - Application Dockerfile build failures
            - GitHub Pages build failures (Jekyll/Ruby errors)
            - Review coverage evaluation failures (post-merge analysis errors)

            DECISION LOGIC:
            - If the failure is ACTIONS_FAILURE: Create a GitHub issue
            - If the failure is TEST_FAILURE, CONFIG_FAILURE, or INFRASTRUCTURE_FAILURE: Do NOT create an issue
              (TEST_FAILURE and CONFIG_FAILURE are handled by the existing Codex Code Review workflow)
              (INFRASTRUCTURE_FAILURE issues are transient external problems that will self-resolve)

            IF YOU DETERMINE THIS IS AN ACTIONS_FAILURE:
            1. Write the issue body below to /tmp/workflow-failure-issue.md
            2. Create or reuse the canonical issue by running:
               scripts/create-deduped-workflow-failure-issue.sh \\
                 \"${REPO}\" \\
                 \"${WORKFLOW_NAME}\" \\
                 \"${RUN_ID}\" \\
                 \"${RUN_URL}\" \\
                 \"${HEAD_BRANCH}\" \\
                 \"${HEAD_SHA}\" \\
                 /tmp/workflow-failure-issue.md
            3. If the script prints an existing issue number, do not create a second issue.

            Build /tmp/workflow-failure-issue.md with this exact structure:
            cat <<'ISSUE_BODY' > /tmp/workflow-failure-issue.md
            ## Workflow Failure Report

            A GitHub Actions configuration problem has been detected.

            ### Workflow Details
            - **Workflow**: ${WORKFLOW_NAME}
            - **Run ID**: [${RUN_ID}](${RUN_URL})
            - **Branch**: ${HEAD_BRANCH}
            - **Commit**: ${HEAD_SHA}

            ### Failure Analysis

            [DESCRIBE THE ACTIONS CONFIGURATION PROBLEM HERE]

            ### Relevant Logs

            \\\`\\\`\\\`
            [PASTE KEY ERROR LOGS HERE]
            \\\`\\\`\\\`

            ### Suggested Fix

            [DESCRIBE HOW TO FIX THE ACTIONS CONFIGURATION]

            ### Files to Review
            - [LIST WORKFLOW YAML FILES THAT NEED CHANGES]

            ---
            Generated by Codex Workflow Diagnostician
            ISSUE_BODY

            IF THIS IS NOT AN ACTIONS_FAILURE, output:
            'DIAGNOSIS: [TEST_FAILURE|CONFIG_FAILURE|INFRASTRUCTURE_FAILURE] - No issue created.'
            For TEST_FAILURE/CONFIG_FAILURE: 'This failure type is handled by the Codex Code Review workflow.'
            For INFRASTRUCTURE_FAILURE: 'This is a transient external service issue that will self-resolve. No action needed.'

            Be thorough in your analysis. Examine the actual error messages in the logs." < /dev/null 2>&1 | tee /tmp/diagnose-output.jsonl

      # Best effort: runs even if the diagnosis step failed, and an upload
      # error never fails the job.
      - name: Upload diagnosis output
        uses: actions/upload-artifact@v4
        if: always()
        continue-on-error: true
        with:
          name: workflow-diagnosis-output
          path: /tmp/diagnose-output.jsonl
          retention-days: 7

      # Best effort: always runs, and neither a missing URL nor a curl error
      # fails the job. The URL is passed through the environment rather than
      # expanded into the script text, so characters such as `$`, backticks or
      # quotes in it are not interpreted by the shell.
      - name: Notify agent webhook
        if: always()
        continue-on-error: true
        env:
          WEBHOOK_URL: ${{ vars.AGENT_WEBHOOK_URL }}
        run: |
          # Signal the OpenClaw agent that a workflow failure was diagnosed
          if [ -n "$WEBHOOK_URL" ]; then
            curl -s --max-time 5 "$WEBHOOK_URL" >/dev/null 2>&1 || true
          fi