4949# run forever
5050while true ; do
5151
# Build the set of Failed jobs to protect (retained for 1 day for debugging).
# Jobs recorded here must NOT be deleted by the normal cleanup below.
#
# Reads:  namespace, AGENT_NAME
# Writes: protected_failed_jobs (job name -> 1), current_epoch, protected_count
declare -A protected_failed_jobs
# `declare -A` on an already-existing array keeps its contents, and this code
# runs once per iteration of the forever-loop: reset explicitly so entries
# from earlier iterations do not accumulate and inflate protected_count.
protected_failed_jobs=()
current_epoch=$(date +%s)

# The loop is fed through process substitution so it runs in the current
# shell and can fill the array directly; as the last stage of a pipeline it
# would run in a subshell and its assignments would be lost.
while read -r job_name job_timestamp; do
  [[ -n "$job_name" && -n "$job_timestamp" ]] || continue
  # Unparsable timestamps yield no epoch; such jobs are not protected.
  job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null) || continue
  [[ -n "$job_epoch" ]] || continue
  # Protect the job while creationTimestamp + 1 day has not passed yet.
  if (( job_epoch + 86400 >= current_epoch )); then
    protected_failed_jobs["$job_name"]=1
  fi
done < <(
  # One "name creationTimestamp" line per job with a Failed=True condition.
  # The trailing [[:space:]] pins the end of the job name so that longer
  # names which merely start with this agent's pattern are not matched.
  kubectl get jobs -n "${namespace}" -o json 2>/dev/null \
    | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' \
    | { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?[[:space:]]" || true; }
)

protected_count=${#protected_failed_jobs[@]}
if [[ "$protected_count" -gt 0 ]]; then
  echo "$(date -Iseconds) === Protecting ${protected_count} failed job(s) less than 1 day old === (AGENT_NAME: ${AGENT_NAME})"
fi
85+
5286 if [ $use_k8s_ttl_controller == false ] ; then
5387 # cleanup finished jobs (status 1/1)
5488 # Filter by AGENT_NAME and check 3rd column for "1/1" (completions)
# Walk this agent's jobs whose COMPLETIONS column (3rd field) reads "1/1".
# Jobs recorded in protected_failed_jobs are reported and left alone;
# every other finished job is deleted.
for job in $(
  kubectl get jobs -n "${namespace}" --no-headers \
    | { grep -E -e "^${AGENT_NAME}-[0-9]+(-[0-9]+)?\\s" || true; } \
    | awk '$3 == "1/1" {print $1}'
); do
  if [[ -z "${protected_failed_jobs[$job]:-}" ]]; then
    echo "=== Job $job Completed (1/1) - deleting from get jobs === (AGENT_NAME: ${AGENT_NAME})"
    kubectl delete job "$job" -n "${namespace}"
  else
    # Protected failed job: keep it for debugging.
    echo "=== Skipping protected failed job: $job === (AGENT_NAME: ${AGENT_NAME})"
  fi
done
@@ -68,6 +107,11 @@ while true; do
68107 if [ -n " $job_prefix_from_pod " ]; then
69108 # Validate that the derived job_prefix_from_pod actually matches the expected format for this agent's jobs
70109 if [[ " ${job_prefix_from_pod} " =~ ^${AGENT_NAME} -[0-9]+ (-[0-9]+)? $ ]]; then
110+ # Skip if this job is a protected failed job
111+ if [[ -n " ${protected_failed_jobs[$job_prefix_from_pod]:- } " ]]; then
112+ echo " === Skipping protected failed job: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME} )"
113+ continue
114+ fi
71115 echo " === Deleting Job based on pod status: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME} )"
72116 kubectl delete job " $job_prefix_from_pod " -n " ${namespace} " --ignore-not-found=true
73117 else
@@ -81,6 +125,27 @@ while true; do
81125 done
82126 fi
83127
# Cleanup Failed jobs that are older than 1 day (they no longer need
# protection for debugging).
#
# Reads: namespace, AGENT_NAME, current_epoch (set earlier in this loop
#        iteration, when the protected list is built).
#
# The trailing [[:space:]] in the name filter pins the end of the job name:
# each input line is "name creationTimestamp", and without the delimiter the
# pattern would also select longer job names that merely start with this
# agent's pattern - and this loop deletes whatever it selects.
kubectl get jobs -n "${namespace}" -o json 2>/dev/null \
  | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' \
  | { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?[[:space:]]" || true; } \
  | while read -r job_name job_timestamp; do
      [[ -n "$job_name" && -n "$job_timestamp" ]] || continue
      # Jobs whose timestamp cannot be parsed are left untouched.
      job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null) || continue
      [[ -n "$job_epoch" ]] || continue
      # Delete only once creationTimestamp + 1 day lies strictly in the past.
      if (( job_epoch + 86400 < current_epoch )); then
        echo "$(date -Iseconds) === Deleting 1-day old failed job: $job_name === (AGENT_NAME: ${AGENT_NAME})"
        kubectl delete job "$job_name" -n "${namespace}" --ignore-not-found=true
      fi
    done

# Clean up temp file (-f: no error if it does not exist)
rm -f "/tmp/protected_failed_jobs_${AGENT_NAME}.txt"
148+
84149 # Get the current ensembles
85150 # Pass the cluster file to the ensemble_count.py script
86151 if [ ! -f " ${FDB_CLUSTER_FILE} " ]; then
0 commit comments