Skip to content

Commit 58084c6

Browse files
author
michael stack
committed
Keep failed jobs around for a day
1 parent 8d66062 commit 58084c6

File tree

1 file changed

+65
-0
lines changed

1 file changed

+65
-0
lines changed

k8s/agent-scaler/agent-scaler.sh

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,49 @@ fi
4949
# run forever
5050
while true; do
5151

# Build the set of Failed jobs to protect (retained for 1 day for debugging).
# These jobs must NOT be deleted by the normal cleanup below.
#
# `declare -A` on its own does not empty an array that already exists, so the
# explicit reset stops names collected on earlier passes of the enclosing loop
# from lingering in the set and inflating the reported count.
declare -A protected_failed_jobs
protected_failed_jobs=()
current_epoch=$(date +%s)

# One "<job-name> <creationTimestamp>" line per job carrying a Failed=True
# condition, limited to this agent's jobs. The trailing [[:space:]] pins the
# pattern to the whole name, so a job whose name merely starts with this
# agent's pattern is not picked up. The list is held in a variable instead of
# a predictable file under /tmp, and the array is filled in the current shell.
failed_job_lines=$(
  kubectl get jobs -n "${namespace}" -o json 2>/dev/null \
    | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' \
    | { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?[[:space:]]" || true; }
)

while read -r job_name job_timestamp; do
  [[ -n "$job_name" && -n "$job_timestamp" ]] || continue
  # An unparsable timestamp leaves job_epoch empty; such a job is not protected.
  job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null) || job_epoch=""
  [[ -n "$job_epoch" ]] || continue
  # Protect the job while creation time + 1 day (86400 s) has not yet passed.
  if (( job_epoch + 86400 >= current_epoch )); then
    protected_failed_jobs["$job_name"]=1
  fi
done <<< "$failed_job_lines"

protected_count=${#protected_failed_jobs[@]}
if [[ "$protected_count" -gt 0 ]]; then
  echo "$(date -Iseconds) === Protecting ${protected_count} failed job(s) less than 1 day old === (AGENT_NAME: ${AGENT_NAME})"
fi

5286
if [ $use_k8s_ttl_controller == false ] ; then
# Clean up finished jobs: list this agent's jobs (name matched against
# AGENT_NAME) and keep those whose 3rd column reads "1/1" (completions).
# A job found in the protected failed-job set is reported and left alone;
# every other match is deleted.
for job in $(
  kubectl get jobs -n "${namespace}" --no-headers \
    | { grep -E -e "^${AGENT_NAME}-[0-9]+(-[0-9]+)?\\s" || true; } \
    | awk '$3 == "1/1" {print $1}'
); do
  if [[ -z "${protected_failed_jobs[$job]:-}" ]]; then
    echo "=== Job $job Completed (1/1) - deleting from get jobs === (AGENT_NAME: ${AGENT_NAME})"
    kubectl delete job "$job" -n "${namespace}"
  else
    echo "=== Skipping protected failed job: $job === (AGENT_NAME: ${AGENT_NAME})"
  fi
done
@@ -68,6 +107,11 @@ while true; do
68107
if [ -n "$job_prefix_from_pod" ]; then
69108
# Validate that the derived job_prefix_from_pod actually matches the expected format for this agent's jobs
70109
if [[ "${job_prefix_from_pod}" =~ ^${AGENT_NAME}-[0-9]+(-[0-9]+)?$ ]]; then
110+
# Skip if this job is a protected failed job
111+
if [[ -n "${protected_failed_jobs[$job_prefix_from_pod]:-}" ]]; then
112+
echo "=== Skipping protected failed job: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME})"
113+
continue
114+
fi
71115
echo "=== Deleting Job based on pod status: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME})"
72116
kubectl delete job "$job_prefix_from_pod" -n "${namespace}" --ignore-not-found=true
73117
else
@@ -81,6 +125,27 @@ while true; do
81125
done
82126
fi
83127

# Delete Failed jobs older than 1 day (their debugging retention has expired).
# Each input line is "<job-name> <creationTimestamp>". The trailing
# [[:space:]] pins the pattern to the whole name, so a job whose name merely
# starts with this agent's pattern is never deleted from here.
kubectl get jobs -n "${namespace}" -o json 2>/dev/null \
  | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' \
  | { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?[[:space:]]" || true; } \
  | while read -r job_name job_timestamp; do
      [[ -n "$job_name" && -n "$job_timestamp" ]] || continue
      # An unparsable timestamp leaves job_epoch empty; such a job is skipped.
      job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null) || job_epoch=""
      [[ -n "$job_epoch" ]] || continue
      # Older than 1 day: creation time + 86400 s lies before current_epoch.
      if (( job_epoch + 86400 < current_epoch )); then
        echo "$(date -Iseconds) === Deleting 1-day old failed job: $job_name === (AGENT_NAME: ${AGENT_NAME})"
        # stdin comes from /dev/null so the command cannot consume the job
        # list this loop is still reading from the pipe.
        kubectl delete job "$job_name" -n "${namespace}" --ignore-not-found=true < /dev/null
      fi
    done

# Clean up temp file
rm -f -- "/tmp/protected_failed_jobs_${AGENT_NAME}.txt"

84149
# Get the current ensembles
85150
# Pass the cluster file to the ensemble_count.py script
86151
if [ ! -f "${FDB_CLUSTER_FILE}" ]; then

0 commit comments

Comments
 (0)