Keep failed jobs around for a day

michael stack · michael stack · commit 58084c64f595 · 2026-01-07T10:12:48.000-08:00
diff --git a/k8s/agent-scaler/agent-scaler.sh b/k8s/agent-scaler/agent-scaler.sh
@@ -49,10 +49,49 @@ fi
 # run forever
 while true; do
 
+    # Build list of Failed jobs to protect (retain for 1 day for debugging)
+    # These jobs should NOT be deleted by the normal cleanup below
+    declare -A protected_failed_jobs
+    current_epoch=$(date +%s)
+    
+    kubectl get jobs -n "${namespace}" -o json 2>/dev/null | \
+      jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | \
+      { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?" || true; } | \
+      while read -r job_name job_timestamp; do
+          if [[ -n "$job_name" ]] && [[ -n "$job_timestamp" ]]; then
+              # Calculate if job is less than 1 day old (should be protected)
+              job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null)
+              if [[ -n "$job_epoch" ]]; then
+                  job_plus_1day_epoch=$((job_epoch + 86400))
+                  if [[ "$job_plus_1day_epoch" -ge "$current_epoch" ]]; then
+                      # Job is less than 1 day old - protect it
+                      echo "$job_name"
+                  fi
+              fi
+          fi
+      done > /tmp/protected_failed_jobs_${AGENT_NAME}.txt
+    
+    # Load protected jobs into associative array for fast lookup
+    while read -r job_name; do
+        if [[ -n "$job_name" ]]; then
+            protected_failed_jobs["$job_name"]=1
+        fi
+    done < /tmp/protected_failed_jobs_${AGENT_NAME}.txt 2>/dev/null
+    
+    protected_count=${#protected_failed_jobs[@]}
+    if [[ "$protected_count" -gt 0 ]]; then
+        echo "$(date -Iseconds) === Protecting ${protected_count} failed job(s) less than 1 day old === (AGENT_NAME: ${AGENT_NAME})"
+    fi
+
     if [ $use_k8s_ttl_controller == false ] ; then
       # cleanup finished jobs (status 1/1)
       # Filter by AGENT_NAME and check 3rd column for "1/1" (completions)
       for job in $(kubectl get jobs -n "${namespace}" --no-headers | { grep -E -e "^${AGENT_NAME}-[0-9]+(-[0-9]+)?\\s" || true; } | awk '$3 == "1/1" {print $1}'); do
+          # Skip if this job is a protected failed job
+          if [[ -n "${protected_failed_jobs[$job]:-}" ]]; then
+              echo "=== Skipping protected failed job: $job === (AGENT_NAME: ${AGENT_NAME})"
+              continue
+          fi
           echo "=== Job $job Completed (1/1) - deleting from get jobs === (AGENT_NAME: ${AGENT_NAME})"
           kubectl delete job "$job" -n "${namespace}"
       done
@@ -68,6 +107,11 @@ while true; do
           if [ -n "$job_prefix_from_pod" ]; then
             # Validate that the derived job_prefix_from_pod actually matches the expected format for this agent's jobs
             if [[ "${job_prefix_from_pod}" =~ ^${AGENT_NAME}-[0-9]+(-[0-9]+)?$ ]]; then
+              # Skip if this job is a protected failed job
+              if [[ -n "${protected_failed_jobs[$job_prefix_from_pod]:-}" ]]; then
+                  echo "=== Skipping protected failed job: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME})"
+                  continue
+              fi
               echo "=== Deleting Job based on pod status: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME})"
               kubectl delete job "$job_prefix_from_pod" -n "${namespace}" --ignore-not-found=true
             else
@@ -81,6 +125,27 @@ while true; do
       done
     fi
 
+    # Cleanup Failed jobs that are older than 1 day (no longer need protection for debugging)
+    kubectl get jobs -n "${namespace}" -o json 2>/dev/null | \
+      jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | \
+      { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?" || true; } | \
+      while read -r job_name job_timestamp; do
+          if [[ -n "$job_name" ]] && [[ -n "$job_timestamp" ]]; then
+              # Calculate if job is older than 1 day
+              job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null)
+              if [[ -n "$job_epoch" ]]; then
+                  job_plus_1day_epoch=$((job_epoch + 86400))
+                  if [[ "$job_plus_1day_epoch" -lt "$current_epoch" ]]; then
+                      echo "$(date -Iseconds) === Deleting 1-day old failed job: $job_name === (AGENT_NAME: ${AGENT_NAME})"
+                      kubectl delete job "$job_name" -n "${namespace}" --ignore-not-found=true
+                  fi
+              fi
+          fi
+      done
+    
+    # Clean up temp file
+    rm -f /tmp/protected_failed_jobs_${AGENT_NAME}.txt
+
     # Get the current ensembles
     # Pass the cluster file to the ensemble_count.py script
     if [ ! -f "${FDB_CLUSTER_FILE}" ]; then