touch job files, detect stale jobs / dead runners

ryan-williams · ryan-williams · commit 3d0fe72d22cd · 2025-09-18T14:41:53.000-04:00
diff --git a/.github/workflows/test-disk-full.yml b/.github/workflows/test-disk-full.yml
@@ -28,10 +28,10 @@ on:
         type: string
         default: 't3.medium'
       max_instance_lifetime:
-        description: 'Max instance lifetime in minutes (default: 20)'
+        description: 'Max instance lifetime in minutes (default: 15)'
         required: false
         type: string
-        default: '20'
+        default: '15'
 
 permissions:
   id-token: write
diff --git a/src/ec2_gha/scripts/runner-setup.sh b/src/ec2_gha/scripts/runner-setup.sh
@@ -269,6 +269,43 @@ if [ $RUNNER_PROCS -eq 0 ]; then
   fi
 fi
 
+# Heartbeat: refresh the mtime of every "running" job file whose runner is still alive,
+# and remove "running" job files whose runner process has died without cleaning up.
+for job_file in "$J"/*.job; do
+  [ -f "$job_file" ] || continue
+  if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
+    # Extract runner number from job file name (format: RUNID-JOBNAME-RUNNER.job)
+    runner_num=$(basename "$job_file" .job | rev | cut -d- -f1 | rev)
+
+    # Check if the runner process for this job is still alive
+    if pgrep -f "runner-${runner_num}/.*Runner.Listener" > /dev/null 2>&1; then
+      # Best-effort touch: a failure (e.g. disk full) is ignored and leaves the mtime stale
+      touch "$job_file" 2>/dev/null || true
+    else
+      # Runner is dead but its job file remains; read the clock here since $N is (re)assigned only below
+      job_age=$(( $(date +%s) - $(stat -c %Y "$job_file" 2>/dev/null || echo 0) ))
+      log "WARNING: Job file $(basename "$job_file") exists but runner $runner_num is dead (file age: ${job_age}s)"
+      rm -f "$job_file"
+    fi
+  fi
+done
+
+# Now check for stale job files that couldn't be touched (e.g., disk full): with polling every
+# RUNNER_POLL_INTERVAL seconds (default 10), a running job's file should never be older than 3x that.
+# Age uses a fresh `date +%s` because $N is (re)assigned only after this block and may be unset/stale here.
+STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3))  # 3x the poll interval
+for job_file in "$J"/*.job; do
+  [ -f "$job_file" ] || continue
+  if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
+    job_age=$(( $(date +%s) - $(stat -c %Y "$job_file" 2>/dev/null || echo 0) ))
+    if [ "$job_age" -gt "$STALE_THRESHOLD" ]; then
+      log "ERROR: Job file $(basename "$job_file") is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
+      log "Touch must be failing (disk full?) - removing stale job file"
+      rm -f "$job_file"
+    fi
+  fi
+done
+
 [ ! -f "$A" ] && touch "$A"
 L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
 N=$(date +%s)