touch job files, detect stale jobs / dead runners

ryan-williams · ryan-williams · commit 7546377a0183 · 2025-09-18T14:07:48.000-04:00
diff --git a/src/ec2_gha/scripts/runner-setup.sh b/src/ec2_gha/scripts/runner-setup.sh
@@ -269,6 +269,41 @@ if [ $RUNNER_PROCS -eq 0 ]; then
   fi
 fi
 
+# Heartbeat: touch each "running" job file whose runner is still alive, and
+# remove job files whose runner's Runner.Listener process has disappeared.
+for job_file in "$J"/*.job; do  # no redirect here: bash rejects one in a for list
+  if [ -f "$job_file" ] && grep -q '"status":"running"' "$job_file"; then
+    # Runner number = trailing digits of the file name once ".job" is stripped
+    job_name=$(basename "$job_file" .job)
+    runner_num=${job_name##*[!0-9]}
+    # Check if the runner process for this job is still alive
+    if pgrep -f "runner-${runner_num}/.*Runner.Listener" > /dev/null 2>&1; then
+      # Best effort: a failing touch leaves an old mtime for the stale check below
+      touch "$job_file" 2>/dev/null || true
+    else
+      # Runner is dead; N is assigned only after this block, so read the clock here
+      job_age=$(($(date +%s) - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
+      log "WARNING: Job file ${job_name}.job exists but runner $runner_num is dead (file age: ${job_age}s)"
+      rm -f "$job_file"
+    fi
+  fi
+done
+
+# Stale-file check: live job files get re-touched on every poll, so a "running"
+# file older than 3x the poll interval means the touch is failing (disk full?).
+# The clock is read inline because N is only assigned after this block.
+STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3))  # 3x the poll interval
+for job_file in "$J"/*.job; do  # no redirect here: bash rejects one in a for list
+  if [ -f "$job_file" ] && grep -q '"status":"running"' "$job_file"; then
+    job_age=$(($(date +%s) - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
+    if [ "$job_age" -gt "$STALE_THRESHOLD" ]; then
+      log "ERROR: Job file $(basename "$job_file") is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
+      log "Touch must be failing (disk full?) - removing stale job file"
+      rm -f "$job_file"
+    fi
+  fi
+done
+
 [ ! -f "$A" ] && touch "$A"
 L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
 N=$(date +%s)