Skip to content

Commit 7546377

Browse files
committed
touch job files, detect stale jobs / dead runners
1 parent c54922e commit 7546377

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

src/ec2_gha/scripts/runner-setup.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,41 @@ if [ $RUNNER_PROCS -eq 0 ]; then
269269
fi
270270
fi
271271
272+
# Heartbeat for in-flight jobs: every job file marked "running" whose runner
# process is still alive gets its mtime refreshed; a job file whose runner has
# died is logged and removed. A later pass can then treat an old mtime as a
# sign that the refresh itself is failing.

# Print the runner number encoded in a job file path: the run of digits
# immediately before the ".job" suffix (prints nothing if there is none).
# NOTE(review): assumes job files are named "<anything><runner-number>.job" --
# confirm against the hook that writes them.
job_file_runner_num() {
  set -- "${1##*/}"            # drop the directory part
  set -- "${1%.job}"           # drop the extension
  printf '%s' "${1##*[!0-9]}"  # keep only the trailing digits
}

# Take a fresh timestamp for the age shown in the log message; in the visible
# code N is only assigned after this loop.
heartbeat_now=$(date +%s)

# No redirection here: it is not valid in a `for` word list. An unmatched glob
# simply stays literal and is rejected by the -f test below.
for job_file in "$J"/*.job; do
  [ -f "$job_file" ] || continue
  grep -q '"status":"running"' "$job_file" || continue

  runner_num=$(job_file_runner_num "$job_file")
  if [ -z "$runner_num" ]; then
    # Without a runner number liveness cannot be checked; do not guess and
    # delete the file here.
    log "WARNING: Cannot determine runner number from job file ${job_file##*/}"
    continue
  fi

  # Check if the runner process for this job is still alive
  if pgrep -f "runner-${runner_num}/.*Runner.Listener" > /dev/null 2>&1; then
    # Runner is alive: refresh the heartbeat. Failure is tolerated on purpose
    # (e.g. disk full); the resulting old mtime is what exposes the problem.
    touch "$job_file" 2>/dev/null || true
  else
    # Runner process is dead but job file still exists
    job_age=$((heartbeat_now - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
    log "WARNING: Job file ${job_file##*/} exists but runner $runner_num is dead (file age: ${job_age}s)"
    rm -f "$job_file"
  fi
done
291+
292+
# Second pass: remove "running" job files whose heartbeat has stopped.
# A job file that is older than three poll intervals (30s with the default
# RUNNER_POLL_INTERVAL of 10) has not had its timestamp refreshed for several
# polls, so something is preventing the touch (likely a full disk).
STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3)) # 3x the poll interval

# Take a fresh timestamp for the age computation; in the visible code N is
# only assigned after this loop.
stale_check_now=$(date +%s)

# No redirection here: it is not valid in a `for` word list. An unmatched glob
# simply stays literal and is rejected by the -f test below.
for job_file in "$J"/*.job; do
  [ -f "$job_file" ] || continue
  grep -q '"status":"running"' "$job_file" || continue

  # If stat fails the mtime is taken as 0, which makes the file count as stale.
  job_age=$((stale_check_now - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
  if [ "$job_age" -gt "$STALE_THRESHOLD" ]; then
    log "ERROR: Job file ${job_file##*/} is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
    log "Touch must be failing (disk full?) - removing stale job file"
    rm -f "$job_file"
  fi
done
306+
272307
[ ! -f "$A" ] && touch "$A"
273308
L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
274309
N=$(date +%s)

0 commit comments

Comments
 (0)