@@ -269,6 +269,41 @@ if [ $RUNNER_PROCS -eq 0 ]; then
269269 fi
270270fi
271271
# Check job files and update timestamps for active runners.
# This creates a heartbeat mechanism to detect stuck/failed job completion:
# a job file marked "running" is touched while its runner process is alive
# and removed (with a warning) once that process is gone.
#
# Globals:  J (read) - directory containing the *.job files
# Calls:    log (defined elsewhere in this script), pgrep, stat, touch, rm
# Returns:  0 always
_refresh_running_job_heartbeats() {
  local job_file job_name stem runner_num now mtime job_age

  # With J unset or missing, "$J"/*.job would glob from "/"; do nothing.
  [ -d "${J:-}" ] || return 0

  for job_file in "$J"/*.job; do
    # An unmatched glob is left as the literal pattern; -f filters that out.
    [ -f "$job_file" ] || continue
    grep -q '"status":"running"' "$job_file" || continue

    job_name=${job_file##*/}
    stem=${job_name%.job}
    # Runner number = trailing digits of the name once ".job" is stripped.
    # NOTE(review): assumes names such as "runner-3.job" - confirm against
    # whatever writes these files.
    runner_num=${stem##*[!0-9]}
    if [ -z "$runner_num" ]; then
      # No runner can be identified from the name, so liveness cannot be
      # checked; leave the file alone rather than delete a possibly live job.
      continue
    fi

    # Check if the runner process for this job is still alive.
    if pgrep -f "runner-${runner_num}/.*Runner.Listener" > /dev/null 2>&1; then
      # Runner is alive: refresh the heartbeat. Best effort - a failing touch
      # simply leaves the old timestamp in place.
      touch "$job_file" 2>/dev/null || true
    else
      # Runner process is dead but the job file still exists.
      # Take a fresh timestamp so the reported age does not depend on a
      # variable that may not have been assigned yet.
      now=$(date +%s)
      mtime=$(stat -c %Y "$job_file" 2>/dev/null || echo 0)
      job_age=$((now - mtime))
      log "WARNING: Job file $job_name exists but runner $runner_num is dead (file age: ${job_age}s)"
      rm -f "$job_file"
    fi
  done
  return 0
}

_refresh_running_job_heartbeats
291+
# Now check for stale job files that couldn't be touched (e.g., disk full).
# With polling every ${RUNNER_POLL_INTERVAL:-10}s, a running job's file
# should never be older than 3x the poll interval. If one is, something is
# preventing the touch (likely disk full), so the file is removed.
#
# Globals:  J (read)                   - directory containing the *.job files
#           RUNNER_POLL_INTERVAL (read) - poll interval in seconds, default 10
#           STALE_THRESHOLD (written)   - 3x the poll interval, in seconds
# Calls:    log (defined elsewhere in this script), stat, rm
# Returns:  0 always
_remove_stale_job_files() {
  local job_file job_name poll now mtime job_age

  poll=${RUNNER_POLL_INTERVAL:-10}
  # A non-numeric interval would abort the arithmetic below; use the default.
  case "$poll" in
    '' | *[!0-9]*) poll=10 ;;
  esac
  STALE_THRESHOLD=$((poll * 3)) # 3x the poll interval

  # With J unset or missing, "$J"/*.job would glob from "/"; do nothing.
  [ -d "${J:-}" ] || return 0

  # One timestamp for the whole sweep, taken here so the age does not depend
  # on a variable that may not have been assigned yet.
  now=$(date +%s)

  for job_file in "$J"/*.job; do
    # An unmatched glob is left as the literal pattern; -f filters that out.
    [ -f "$job_file" ] || continue
    grep -q '"status":"running"' "$job_file" || continue

    # If stat fails the mtime is taken as 0, which makes the file look stale.
    mtime=$(stat -c %Y "$job_file" 2>/dev/null || echo 0)
    job_age=$((now - mtime))
    if [ "$job_age" -gt "$STALE_THRESHOLD" ]; then
      job_name=${job_file##*/}
      log "ERROR: Job file $job_name is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
      log "Touch must be failing (disk full?) - removing stale job file"
      rm -f "$job_file"
    fi
  done
  return 0
}

_remove_stale_job_files
306+
# Create the file named by "$A" when it is not already a regular file, so the
# stat below has something to read.
if [ ! -f "$A" ]; then
  touch "$A"
fi
# L: modification time of "$A" in epoch seconds (0 when stat fails).
L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
# N: current time in epoch seconds.
N=$(date +%s)
0 commit comments