Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions k8s/agent-scaler/agent-scaler.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,49 @@ fi
# run forever
while true; do

# Build list of Failed jobs to protect (retain for 1 day for debugging)
# These jobs should NOT be deleted by the normal cleanup below
declare -A protected_failed_jobs
current_epoch=$(date +%s)

kubectl get jobs -n "${namespace}" -o json 2>/dev/null | \
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | \
{ grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?" || true; } | \
while read -r job_name job_timestamp; do
if [[ -n "$job_name" ]] && [[ -n "$job_timestamp" ]]; then
# Calculate if job is less than 1 day old (should be protected)
job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null)
if [[ -n "$job_epoch" ]]; then
job_plus_1day_epoch=$((job_epoch + 86400))
if [[ "$job_plus_1day_epoch" -ge "$current_epoch" ]]; then
# Job is less than 1 day old - protect it
echo "$job_name"
fi
fi
fi
done > /tmp/protected_failed_jobs_${AGENT_NAME}.txt

# Load protected jobs into associative array for fast lookup
while read -r job_name; do
if [[ -n "$job_name" ]]; then
protected_failed_jobs["$job_name"]=1
fi
done < /tmp/protected_failed_jobs_${AGENT_NAME}.txt 2>/dev/null

protected_count=${#protected_failed_jobs[@]}
if [[ "$protected_count" -gt 0 ]]; then
echo "$(date -Iseconds) === Protecting ${protected_count} failed job(s) less than 1 day old === (AGENT_NAME: ${AGENT_NAME})"
fi

if [ $use_k8s_ttl_controller == false ] ; then
# cleanup finished jobs (status 1/1)
# Filter by AGENT_NAME and check 3rd column for "1/1" (completions)
for job in $(kubectl get jobs -n "${namespace}" --no-headers | { grep -E -e "^${AGENT_NAME}-[0-9]+(-[0-9]+)?\\s" || true; } | awk '$3 == "1/1" {print $1}'); do
# Skip if this job is a protected failed job
if [[ -n "${protected_failed_jobs[$job]:-}" ]]; then
echo "=== Skipping protected failed job: $job === (AGENT_NAME: ${AGENT_NAME})"
continue
fi
echo "=== Job $job Completed (1/1) - deleting from get jobs === (AGENT_NAME: ${AGENT_NAME})"
kubectl delete job "$job" -n "${namespace}"
done
Expand All @@ -68,6 +107,11 @@ while true; do
if [ -n "$job_prefix_from_pod" ]; then
# Validate that the derived job_prefix_from_pod actually matches the expected format for this agent's jobs
if [[ "${job_prefix_from_pod}" =~ ^${AGENT_NAME}-[0-9]+(-[0-9]+)?$ ]]; then
# Skip if this job is a protected failed job
if [[ -n "${protected_failed_jobs[$job_prefix_from_pod]:-}" ]]; then
echo "=== Skipping protected failed job: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME})"
continue
fi
echo "=== Deleting Job based on pod status: $job_prefix_from_pod === (AGENT_NAME: ${AGENT_NAME})"
kubectl delete job "$job_prefix_from_pod" -n "${namespace}" --ignore-not-found=true
else
Expand All @@ -81,6 +125,27 @@ while true; do
done
fi

# Cleanup Failed jobs that are older than 1 day (no longer need protection for debugging)
kubectl get jobs -n "${namespace}" -o json 2>/dev/null | \
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Failed" and .status=="True")) | .metadata.name + " " + .metadata.creationTimestamp' | \
{ grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?" || true; } | \
while read -r job_name job_timestamp; do
if [[ -n "$job_name" ]] && [[ -n "$job_timestamp" ]]; then
# Calculate if job is older than 1 day
job_epoch=$(date -d "$job_timestamp" +%s 2>/dev/null)
if [[ -n "$job_epoch" ]]; then
job_plus_1day_epoch=$((job_epoch + 86400))
if [[ "$job_plus_1day_epoch" -lt "$current_epoch" ]]; then
echo "$(date -Iseconds) === Deleting 1-day old failed job: $job_name === (AGENT_NAME: ${AGENT_NAME})"
kubectl delete job "$job_name" -n "${namespace}" --ignore-not-found=true
fi
fi
fi
done

# Clean up temp file
rm -f /tmp/protected_failed_jobs_${AGENT_NAME}.txt

# Get the current ensembles
# Pass the cluster file to the ensemble_count.py script
if [ ! -f "${FDB_CLUSTER_FILE}" ]; then
Expand Down