Skip to content

Commit 3d0fe72

Browse files
committed
touch job files, detect stale jobs / dead runners
1 parent c54922e commit 3d0fe72

File tree

2 files changed

+39
-2
lines changed

2 files changed

+39
-2
lines changed

.github/workflows/test-disk-full.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ on:
2828
type: string
2929
default: 't3.medium'
3030
max_instance_lifetime:
31-
description: 'Max instance lifetime in minutes (default: 20)'
31+
description: 'Max instance lifetime in minutes (default: 15)'
3232
required: false
3333
type: string
34-
default: '20'
34+
default: '15'
3535

3636
permissions:
3737
id-token: write

src/ec2_gha/scripts/runner-setup.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,43 @@ if [ $RUNNER_PROCS -eq 0 ]; then
269269
fi
270270
fi
271271
272+
# Heartbeat pass over the job files in $J.
#
# For every job file that is still marked "status":"running":
#   - if a Runner.Listener process for that runner is found, refresh the
#     file's mtime (the heartbeat);
#   - otherwise log a warning and delete the file, since the runner process
#     is gone but its job file was left behind.
#
# The runner number is the last '-'-separated field of the file name
# (format: RUNID-JOBNAME-RUNNER.job).
#
# The file age is measured against a timestamp taken right here. The visible
# code only assigns $N *after* this section, so relying on $N would use an
# unset or out-of-date value and report a wrong (possibly negative) age.
heartbeat_now=$(date +%s)
for job_file in "$J"/*.job; do
  # An unmatched glob stays as the literal pattern; skip anything that is
  # not a regular file.
  [ -f "$job_file" ] || continue
  # Only jobs that are still marked as running are of interest.
  grep -q '"status":"running"' "$job_file" 2>/dev/null || continue

  job_name=${job_file##*/}      # file name without the directory part
  runner_num=${job_name%.job}   # drop the .job suffix ...
  runner_num=${runner_num##*-}  # ... and keep only the text after the last '-'

  if pgrep -f "runner-${runner_num}/.*Runner.Listener" > /dev/null 2>&1; then
    # Runner is alive: bump the mtime. Best effort - a failing touch is
    # tolerated here and shows up later as an old mtime.
    touch "$job_file" 2>/dev/null || true
  else
    # Runner process is gone but its job file is still around.
    # If stat fails, fall back to mtime 0 so the age is still a number.
    job_mtime=$(stat -c %Y "$job_file" 2>/dev/null || echo 0)
    job_age=$((heartbeat_now - job_mtime))
    log "WARNING: Job file ${job_name} exists but runner $runner_num is dead (file age: ${job_age}s)"
    rm -f "$job_file"
  fi
done
292+
293+
# Stale-job sweep over the job files in $J.
#
# The preceding heartbeat section touches the job file of every runner that
# is still alive, so a "running" job file with an old mtime means the touch
# did not take effect (e.g. disk full). Such files are logged and removed.
#
# The threshold is three poll intervals: with the default
# RUNNER_POLL_INTERVAL of 10s a file must be more than 30s old to count
# as stale.
#
# The age is measured against a timestamp taken right here. The visible code
# only assigns $N *after* this section; with $N unset the computed age would
# be negative and the threshold could never trip.
STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3)) # 3x the poll interval
stale_check_now=$(date +%s)
for job_file in "$J"/*.job; do
  # An unmatched glob stays as the literal pattern; skip anything that is
  # not a regular file.
  [ -f "$job_file" ] || continue
  # Only jobs that are still marked as running are of interest.
  grep -q '"status":"running"' "$job_file" 2>/dev/null || continue

  # If stat fails, fall back to mtime 0, which makes the file look very old
  # and therefore stale.
  job_mtime=$(stat -c %Y "$job_file" 2>/dev/null || echo 0)
  job_age=$((stale_check_now - job_mtime))
  if [ "$job_age" -gt "$STALE_THRESHOLD" ]; then
    log "ERROR: Job file ${job_file##*/} is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
    log "Touch must be failing (disk full?) - removing stale job file"
    rm -f "$job_file"
  fi
done
272309
[ ! -f "$A" ] && touch "$A"
273310
L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
274311
N=$(date +%s)

0 commit comments

Comments
 (0)