forked from ClickHouse/ClickHouse
-
Notifications
You must be signed in to change notification settings - Fork 0
299 lines (256 loc) · 12 KB
/
Copy pathretry_infra_failures.yml
File metadata and controls
299 lines (256 loc) · 12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# Workflow that periodically looks for pull-request CI runs that failed and
# re-runs the ones that look like infrastructure failures; a second job
# re-toggles auto-merge on PRs that appear stuck.
name: Retry Infrastructure Failures
on:
schedule:
# Runs at minute 43 of every hour.
- cron: '43 * * * *'
# Also allows the workflow to be started manually.
workflow_dispatch:
# Token scopes used by the jobs below, which call `gh run rerun` and
# `gh pr merge`.
permissions:
actions: write
pull-requests: write
jobs:
# Job 1: re-run failed runs of pull_request.yml that are classified as
# infrastructure failures by the script in the step below.
retry-failed:
runs-on: ubuntu-latest
steps:
- name: Retry PR runs failed due to infrastructure errors
env:
# GH_TOKEN authenticates the gh CLI; GH_REPO (owner/name) is read
# explicitly by the script as "$GH_REPO".
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GH_REPO: ${{ github.repository }}
run: |
set -euo pipefail

# Sweep over recent failed runs of pull_request.yml that are still on their
# first attempt, and re-run (at most MAX_RERUNS per sweep) those whose failure
# is classified as infrastructure-related by one of three checks:
#   1. job-level heuristic on the GitHub jobs API data,
#   2. Praktika result JSONs on S3,
#   3. "Config Workflow" failed and the PR was edited afterwards.
# Required environment: GH_TOKEN (for gh), GH_REPO (owner/name).
readonly MAX_RERUNS=10
rerun_count=0
cutoff=$(date -u -d '6 hours ago' +%Y-%m-%dT%H:%M:%SZ)

#######################################
# Emit one verdict per failed job: "true" (infrastructure) or "false"
# (real failure). A job is considered an infrastructure failure if:
#   - it is the "Config Workflow" or "Finish Workflow" job (pipeline plumbing,
#     not actual tests — always treated as infrastructure failure), or
#   - it never reached its main "Run" step (failed during checkout/setup), or
#   - the "Run" step was skipped, or
#   - the "Run" step failed almost immediately (under 2 minutes), indicating
#     a setup/download issue (e.g. missing S3 credentials) rather than a real
#     test failure.
# A missing "steps" array is treated like an empty one, and a missing or
# unparsable step timestamp counts as a real failure instead of aborting.
# Arguments: none
# Inputs:    jobs API JSON on stdin (one or more concatenated pages)
# Outputs:   one "true"/"false" line per failed job on stdout
#######################################
job_verdicts() {
  jq -r '
    .jobs[] | select(.conclusion == "failure") |
    if .name == "Config Workflow" or .name == "Finish Workflow" then true
    else
      [(.steps // [])[] | select(.name == "Run")] |
      if length == 0 then true
      elif .[0].conclusion == "skipped" then true
      elif .[0].conclusion == "failure" then
        (try (((.[0].completed_at | fromdateiso8601) -
               (.[0].started_at | fromdateiso8601)) < 120)
         catch false)
      else false
      end
    end
  '
}

#######################################
# Normalize a job name to match Praktika result file naming: lowercase,
# replace non-alphanumeric chars with underscore, collapse runs.
# Arguments: $1 - job name
# Outputs:   normalized name on stdout
#######################################
normalize_job_name() {
  tr '[:upper:]' '[:lower:]' <<< "$1" | sed 's/[^a-z0-9_]/_/g; s/__*/_/g'
}

#######################################
# Check the Praktika result JSON of every given job on S3. A job counts as an
# infrastructure failure when its result carries the "infra" label
# (e.g. Docker image pull failures) or when every failed sub-result is
# "Checkout Submodules" (transient DNS/git issues).
# Runs in an `if` condition, where errexit is off, so every failure is
# handled explicitly: a missing, unreachable or malformed report means the
# failure cannot be proven to be infrastructure-related.
# Arguments: $1 - PR number, $2 - head SHA of the run, $3.. - failed job names
# Returns:   0 if there is at least one job and all of them are infra
#            failures, 1 otherwise
#######################################
praktika_reports_all_infra() {
  local -r pr_number=$1 run_sha=$2
  shift 2
  local job_name normalized result_url result_json
  local has_infra_label has_non_checkout_failure
  local checked=0

  for job_name in "$@"; do
    [[ -n "$job_name" ]] || continue
    normalized=$(normalize_job_name "$job_name")
    result_url="https://s3.amazonaws.com/clickhouse-test-reports/PRs/${pr_number}/${run_sha}/result_${normalized}.json"

    # Best effort on purpose: curl errors are swallowed and surface as an
    # empty body.
    result_json=$(curl -sf --compressed "$result_url" 2>/dev/null || true)
    [[ -n "$result_json" ]] || return 1

    # Check if the top-level result has the "infra" label
    has_infra_label=$(jq -r '
      (.ext.labels // []) | any(. == "infra")
    ' <<< "$result_json" 2>/dev/null) || return 1
    checked=$((checked + 1))
    if [[ "$has_infra_label" == "true" ]]; then
      continue
    fi

    # Check: all failed sub-results must be "Checkout Submodules"
    has_non_checkout_failure=$(jq -r '
      any((.results // [])[];
          (.status == "failure" or .status == "error")
          and .name != "Checkout Submodules")
    ' <<< "$result_json" 2>/dev/null) || return 1
    if [[ "$has_non_checkout_failure" == "true" ]]; then
      return 1
    fi
  done

  (( checked > 0 ))
}

echo "Looking for failed PR workflow runs since $cutoff..."
echo ""

# Get recent failed runs for the PR workflow that have not been retried yet
run_ids_raw=$(gh run list \
  --repo "$GH_REPO" \
  --workflow pull_request.yml \
  --status failure \
  --limit 50 \
  --json databaseId,attempt,createdAt \
  --jq ".[] | select(.attempt == 1 and .createdAt >= \"$cutoff\") | .databaseId")

if [[ -z "$run_ids_raw" ]]; then
  echo "No recent failed runs found."
  exit 0
fi
mapfile -t run_ids <<< "$run_ids_raw"

for run_id in "${run_ids[@]}"; do
  [[ -n "$run_id" ]] || continue

  if (( rerun_count >= MAX_RERUNS )); then
    echo "Reached maximum of $MAX_RERUNS reruns, stopping."
    break
  fi

  run_url="https://github.com/$GH_REPO/actions/runs/$run_id"
  echo "Checking run $run_url ..."
  should_rerun=false

  # Fetch all job data once (reused for multiple checks below).
  # --paginate may print several JSON documents (runs can have >100 jobs).
  jobs_raw=$(gh api "repos/$GH_REPO/actions/runs/$run_id/jobs?per_page=100" --paginate)

  # Check 1: infrastructure failure = at least one failed job, and all of
  # them are infra. A here-string is used instead of `echo | grep -q`: under
  # pipefail, grep exiting early can make the pipeline fail for a reason
  # other than "no match".
  verdicts=$(job_verdicts <<< "$jobs_raw")
  if [[ -n "$verdicts" ]] && ! grep -qx 'false' <<< "$verdicts"; then
    should_rerun=true
    echo " Infrastructure failure detected (job-level heuristic)."
  fi

  if [[ "$should_rerun" == "false" ]]; then
    # Run metadata is only needed by the two remaining checks, so it is
    # fetched only when the job-level heuristic did not already decide.
    run_data=$(gh api "repos/$GH_REPO/actions/runs/$run_id" \
      --jq '{pr: .pull_requests[0].number, sha: .head_sha}')
    pr_number=$(jq -r '.pr // empty' <<< "$run_data")
    run_sha=$(jq -r '.sha // empty' <<< "$run_data")

    # Check 2: Praktika result JSONs on S3.
    if [[ -n "$verdicts" && -n "$pr_number" && -n "$run_sha" ]]; then
      # Get names of failed jobs (excluding pipeline plumbing)
      failed_job_names=$(jq -r '
        .jobs[] | select(.conclusion == "failure") |
        select(.name != "Config Workflow" and .name != "Finish Workflow") |
        .name
      ' <<< "$jobs_raw")
      mapfile -t failed_jobs <<< "$failed_job_names"

      if praktika_reports_all_infra "$pr_number" "$run_sha" "${failed_jobs[@]}"; then
        should_rerun=true
        echo " Infrastructure failure detected (Praktika results indicate infra issue)."
      fi
    fi

    # Check 3: "Config Workflow" failed in its "Run" step (e.g. due to
    # pr_labels_and_category.py rejecting the changelog category). If the PR
    # description was edited after the failure (same HEAD commit, so no new
    # workflow run was triggered), re-run to pick up the fix.
    if [[ "$should_rerun" == "false" && -n "$pr_number" ]]; then
      # -s slurps all pages so that at most one timestamp is printed.
      config_failed_at=$(jq -rs '
        [.[].jobs[]
          | select(.name == "Config Workflow" and .conclusion == "failure")
          | (.steps // [])[] | select(.name == "Run" and .conclusion == "failure")
          | .completed_at
        ] | first // empty
      ' <<< "$jobs_raw")

      if [[ -n "$config_failed_at" ]]; then
        pr_data=$(gh api "repos/$GH_REPO/pulls/$pr_number" \
          --jq '{sha: .head.sha, updated: .updated_at}')
        pr_sha=$(jq -r '.sha' <<< "$pr_data")
        pr_updated=$(jq -r '.updated' <<< "$pr_data")
        # The two timestamps are compared as strings.
        if [[ -n "$run_sha" && "$run_sha" == "$pr_sha" \
              && "$pr_updated" > "$config_failed_at" ]]; then
          should_rerun=true
          echo " Config Workflow failed but PR #$pr_number was updated after — rerunning."
        fi
      fi
    fi
  fi

  if [[ "$should_rerun" == "true" ]]; then
    if gh run rerun "$run_id" --repo "$GH_REPO"; then
      rerun_count=$((rerun_count + 1))
      echo " Triggered rerun: $run_url/attempts/2"
    else
      echo " Failed to trigger rerun (may already be rerunning)"
    fi
  else
    echo " Not an infrastructure failure, skipping."
  fi
  echo ""
done

echo "Done. Triggered $rerun_count rerun(s)."
# Job 2: find open PRs that have auto-merge enabled but were not queued for
# merging, and disable/re-enable auto-merge on them (see the script below).
fix-stuck-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Fix PRs stuck with auto-merge enabled but not queued
env:
# GH_TOKEN authenticates the gh CLI; GH_REPO (owner/name) is split by the
# script into the GraphQL owner and repository name.
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GH_REPO: ${{ github.repository }}
run: |
set -euo pipefail

# Find open, non-draft PRs that have had auto-merge enabled for at least
# CUTOFF_SECONDS while their mergeStateStatus is CLEAN or UNKNOWN, and
# disable/re-enable auto-merge on them.
# Required environment: GH_TOKEN (for gh), GH_REPO (owner/name).
readonly CUTOFF_SECONDS=3600 # 60 minutes
readonly MAX_PAGES=10        # 20 PRs per page, most recently updated first

echo "Looking for PRs with auto-merge enabled but not queued..."
echo ""

now=$(date +%s)
owner="${GH_REPO%%/*}"
repo_name="${GH_REPO##*/}"

# Get open PRs with auto-merge enabled that have mergeStateStatus CLEAN
# or UNKNOWN (checks passed but not queued into merge queue).
# UNKNOWN can happen when GitHub's merge status evaluation gets stuck.
# We use GraphQL pagination (20 PRs per page) to avoid 502 errors
# that occur when fetching too many PRs in a single request.
# The query is kept in a variable, so no temporary file is needed.
query='
query($owner: String!, $repo: String!, $cursor: String) {
  repository(owner: $owner, name: $repo) {
    pullRequests(states: OPEN, first: 20, after: $cursor, orderBy: {field: UPDATED_AT, direction: DESC}) {
      pageInfo { hasNextPage endCursor }
      nodes {
        number
        isDraft
        mergeStateStatus
        autoMergeRequest { enabledAt }
      }
    }
  }
}'

stuck_prs='[]'
cursor_args=()
has_next=false
for ((page = 1; page <= MAX_PAGES; page++)); do
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # older Bash versions.
  response=$(gh api graphql \
    -f query="$query" \
    -f owner="$owner" \
    -f repo="$repo_name" \
    ${cursor_args[@]+"${cursor_args[@]}"})

  # Append this page's candidates to the accumulated JSON array.
  stuck_prs=$(jq --argjson acc "$stuck_prs" '
    $acc + [
      .data.repository.pullRequests.nodes[] | select(
        .autoMergeRequest != null and
        (.mergeStateStatus == "CLEAN" or .mergeStateStatus == "UNKNOWN") and
        .isDraft == false
      ) | {number, autoMergeRequest, mergeStateStatus, isDraft}
    ]
  ' <<< "$response")

  has_next=$(jq -r '.data.repository.pullRequests.pageInfo.hasNextPage' <<< "$response")
  if [[ "$has_next" != "true" ]]; then
    break
  fi
  cursor=$(jq -r '.data.repository.pullRequests.pageInfo.endCursor' <<< "$response")
  cursor_args=(-f cursor="$cursor")
done

# has_next is still "true" here only when the page cap ended the loop.
if [[ "$has_next" == "true" ]]; then
  echo "WARNING: stopped after $MAX_PAGES pages; older PRs were not inspected." >&2
fi

count=$(jq 'length' <<< "$stuck_prs")
if (( count == 0 )); then
  echo "No stuck PRs found."
  exit 0
fi

echo "Found $count candidate PR(s), checking age..."
echo ""

# One "number<TAB>enabledAt" line per candidate, extracted in a single pass.
candidates_tsv=$(jq -r '.[] | [.number, .autoMergeRequest.enabledAt] | @tsv' <<< "$stuck_prs")
mapfile -t candidates <<< "$candidates_tsv"

fixed=0
for candidate in "${candidates[@]}"; do
  [[ -n "$candidate" ]] || continue
  IFS=$'\t' read -r pr_number enabled_at <<< "$candidate"

  # A missing or unparsable timestamp must not be read as "enabled at the
  # epoch", which would make the PR look stuck and get it toggled. The empty
  # check is needed because GNU `date -d ""` succeeds (today's midnight).
  if [[ -z "${enabled_at:-}" ]] \
    || ! enabled_ts=$(date -d "$enabled_at" +%s 2>/dev/null); then
    echo "PR #$pr_number: cannot parse auto-merge timestamp '${enabled_at:-}', skipping." >&2
    continue
  fi

  age_seconds=$((now - enabled_ts))
  if (( age_seconds < CUTOFF_SECONDS )); then
    echo "PR #$pr_number: auto-merge enabled ${age_seconds}s ago (< ${CUTOFF_SECONDS}s), skipping."
    continue
  fi

  age_minutes=$((age_seconds / 60))
  echo "PR #$pr_number: stuck for ${age_minutes} minutes, retoggling auto-merge..."

  if gh pr merge "$pr_number" --disable-auto --repo "$GH_REPO"; then
    # Short pause between disabling and re-enabling.
    sleep 2
    if gh pr merge "$pr_number" --auto --repo "$GH_REPO"; then
      echo " OK: Auto-merge retoggled for PR #$pr_number"
      fixed=$((fixed + 1))
    else
      echo " ERROR: Failed to re-enable auto-merge for PR #$pr_number"
    fi
  else
    echo " ERROR: Failed to disable auto-merge for PR #$pr_number"
  fi
  echo ""
done

echo "Done. Fixed $fixed stuck PR(s)."