@@ -269,6 +269,43 @@ if [ $RUNNER_PROCS -eq 0 ]; then
269269 fi
270270fi
271271
# Heartbeat pass over the job files in $J.
#
# For every *.job file whose content contains "status":"running":
#   - if a Runner.Listener process for the runner named in the file name is
#     found, the file is touched so its mtime acts as a heartbeat;
#   - otherwise the file is treated as left over from a dead runner: a
#     warning with the file's age is logged and the file is removed.
#
# Globals:   J (read) - directory holding the job files
# Calls:     log (defined elsewhere in this script)
# Returns:   0
refresh_running_job_heartbeats() {
  local job_file job_name runner_num now mtime job_age

  # With J empty the glob below would scan the filesystem root.
  [ -n "${J:-}" ] || return 0

  # Take a fresh timestamp instead of reading the global N: the surrounding
  # script assigns N only after this section, so N is unset or stale here.
  now=$(date +%s)

  for job_file in "$J"/*.job; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [ -f "$job_file" ] || continue
    grep -q '"status":"running"' "$job_file" 2>/dev/null || continue

    # File name format: RUNID-JOBNAME-RUNNER.job -> the runner id is the
    # text after the last dash.
    job_name=${job_file##*/}
    job_name=${job_name%.job}
    runner_num=${job_name##*-}

    if pgrep -f "runner-${runner_num}/.*Runner.Listener" > /dev/null 2>&1; then
      # Runner is alive: refresh the heartbeat. Best effort on purpose; a
      # failing touch leaves the old mtime in place.
      touch "$job_file" 2>/dev/null || true
    else
      # Runner process is gone but its job file is still marked running.
      mtime=$(stat -c %Y "$job_file" 2>/dev/null || echo 0)
      job_age=$((now - mtime))
      log "WARNING: Job file ${job_name}.job exists but runner $runner_num is dead (file age: ${job_age}s)"
      rm -f -- "$job_file"
    fi
  done
  return 0
}
refresh_running_job_heartbeats
292+
# Remove "running" job files whose heartbeat has gone stale.
#
# A *.job file in $J whose content contains "status":"running" and whose
# mtime is older than STALE_THRESHOLD seconds is logged as an error and
# deleted. The threshold is three times RUNNER_POLL_INTERVAL (default 10s),
# on the reasoning that a file refreshed on every poll should never get that
# old unless the refresh itself is failing (e.g. disk full).
#
# Globals:   J (read) - directory holding the job files
#            RUNNER_POLL_INTERVAL (read, optional)
#            STALE_THRESHOLD (written)
# Calls:     log (defined elsewhere in this script)
# Returns:   0
purge_stale_running_jobs() {
  local job_file now mtime job_age

  # Kept global, as before, in case other parts of the script read it.
  STALE_THRESHOLD=$(( ${RUNNER_POLL_INTERVAL:-10} * 3 ))

  # With J empty the glob below would scan the filesystem root.
  [ -n "${J:-}" ] || return 0

  # Take a fresh timestamp instead of reading the global N: the surrounding
  # script assigns N only after this section, so N is unset or stale here
  # and the computed age could never exceed the threshold.
  now=$(date +%s)

  for job_file in "$J"/*.job; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [ -f "$job_file" ] || continue
    grep -q '"status":"running"' "$job_file" 2>/dev/null || continue

    # If stat fails the mtime falls back to 0, which makes the file look
    # maximally old and therefore stale.
    mtime=$(stat -c %Y "$job_file" 2>/dev/null || echo 0)
    job_age=$((now - mtime))
    if [ "$job_age" -gt "$STALE_THRESHOLD" ]; then
      log "ERROR: Job file ${job_file##*/} is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
      log "Touch must be failing (disk full?) - removing stale job file"
      rm -f -- "$job_file"
    fi
  done
  return 0
}
purge_stale_running_jobs
308+
272309[ ! -f "$A" ] && touch "$A"
273310L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
274311N=$(date +%s)
0 commit comments