Skip to content

Commit 6a39907

Browse files
committed
ci: improve spot termination detection for automatic reruns
- Check job metadata/annotations for "operation was canceled" errors
- Treat failed log downloads as infrastructure failures
- Fixes cases where spot termination happens too fast for the monitor script
1 parent f08fd8e commit 6a39907

1 file changed

Lines changed: 39 additions & 6 deletions

File tree

.github/workflows/pr-test.yml

Lines changed: 39 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -256,9 +256,20 @@ jobs:
256256
exit 0
257257
fi
258258
for JOB_ID in $FAILED_JOBS; do
259-
# Download logs (may be ZIP or plain text depending on GitHub API)
259+
# First check job metadata for runner communication errors
260+
# This catches "The operation was canceled" which appears in annotations, not logs
261+
JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
262+
if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
263+
echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
264+
SPOT_TERMINATION=true
265+
break
266+
fi
267+
268+
# Try to download logs - if we can't, likely infrastructure failure
260269
if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
261-
continue
270+
echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
271+
SPOT_TERMINATION=true
272+
break
262273
fi
263274
# Try to unzip if it's a ZIP file, otherwise use as-is
264275
if file job_log.zip | grep -q "Zip archive"; then
@@ -424,9 +435,20 @@ jobs:
424435
exit 0
425436
fi
426437
for JOB_ID in $FAILED_JOBS; do
427-
# Download logs (may be ZIP or plain text depending on GitHub API)
438+
# First check job metadata for runner communication errors
439+
# This catches "The operation was canceled" which appears in annotations, not logs
440+
JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
441+
if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
442+
echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
443+
SPOT_TERMINATION=true
444+
break
445+
fi
446+
447+
# Try to download logs - if we can't, likely infrastructure failure
428448
if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
429-
continue
449+
echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
450+
SPOT_TERMINATION=true
451+
break
430452
fi
431453
# Try to unzip if it's a ZIP file, otherwise use as-is
432454
if file job_log.zip | grep -q "Zip archive"; then
@@ -576,9 +598,20 @@ jobs:
576598
exit 0
577599
fi
578600
for JOB_ID in $FAILED_JOBS; do
579-
# Download logs (may be ZIP or plain text depending on GitHub API)
601+
# First check job metadata for runner communication errors
602+
# This catches "The operation was canceled" which appears in annotations, not logs
603+
JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
604+
if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
605+
echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
606+
SPOT_TERMINATION=true
607+
break
608+
fi
609+
610+
# Try to download logs - if we can't, likely infrastructure failure
580611
if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
581-
continue
612+
echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
613+
SPOT_TERMINATION=true
614+
break
582615
fi
583616
# Try to unzip if it's a ZIP file, otherwise use as-is
584617
if file job_log.zip | grep -q "Zip archive"; then

0 commit comments

Comments
 (0)