FoundationDB · saintstack · May 30, 2025 · May 30, 2025
diff --git a/k8s/agent-scaler/agent-scaler.sh b/k8s/agent-scaler/agent-scaler.sh
@@ -1,5 +1,19 @@
 #!/usr/bin/env bash
 
+# This script acts as a Kubernetes agent scaler for Joshua test jobs.
+# Its primary responsibilities are:
+# 1. Periodically cleaning up completed or failed Joshua agent jobs specific
+#    to the AGENT_NAME it's configured for (e.g., joshua-agent, joshua-rhel9-agent).
+# 2. Monitoring the queue of pending test ensembles (via /tools/ensemble_count.py).
+# 3. Provisioning new Joshua agent jobs of its configured AGENT_NAME type if there
+#    is demand and the total number of active Joshua jobs (of any type) in the
+#    namespace is below the global MAX_JOBS limit.
+# It uses kubectl for all Kubernetes interactions and is configured via
+# environment variables such as BATCH_SIZE, MAX_JOBS, CHECK_DELAY, AGENT_NAME,
+# and AGENT_TAG (for the job template).
+#
+# This script is intended to work for joshua-agent and for joshua-rhel9-agent.
+
 batch_size=${BATCH_SIZE:-1}
 max_jobs=${MAX_JOBS:-10}
 check_delay=${CHECK_DELAY:-10}
@@ -10,6 +24,9 @@ restart_agents_on_boot=${RESTART_AGENTS_ON_BOOT:-false}
 
 namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)
 
+# Default AGENT_NAME to "joshua-agent" if not set or empty
+export AGENT_NAME=${AGENT_NAME:-"joshua-agent"}
+
 # if AGENT_TAG is not set through --build-arg,
 # use the default agent image and tag
 export AGENT_TAG=${AGENT_TAG:-"foundationdb/joshua-agent:latest"}
@@ -24,70 +41,99 @@ while true; do
 
     if [ $use_k8s_ttl_controller == false ] ; then
       # cleanup finished jobs (status 1/1)
-      for job in $(kubectl get jobs -n "${namespace}" --no-headers | grep -E -e "${AGENT_NAME}" | awk '{if ($2 == "Complete") { print $1; } }'); do
-          echo "=== Job $job Completed ==="
+      # Filter by AGENT_NAME and check 3rd column for "1/1" (completions)
+      for job in $(kubectl get jobs -n "${namespace}" --no-headers | { grep -E -e "^${AGENT_NAME}-[0-9]+(-[0-9]+)?\\s" || true; } | awk '$3 == "1/1" {print $1}'); do
+          echo "=== Job $job Completed (1/1) - deleting from get jobs === (AGENT_NAME: ${AGENT_NAME})"
           kubectl delete job "$job" -n "${namespace}"
       done
 
-      # cleanup failed jobs
+      # cleanup failed/completed jobs by looking at pods for the current AGENT_NAME
       # pod name is always prefixed with the job name
-      # e.g. "joshua-agent-XXXXXXXXXXXX-XX-yyyyy"
-      for job in $(kubectl get pods -n "${namespace}" --no-headers | grep -E -e "Completed" -e "Error" | cut -f 1-4 -d '-') ; do
-          echo "=== Deleting Job $job ==="
-          kubectl delete job "$job" -n "${namespace}"
+      # e.g. "joshua-agent-XXXXXXXXXXXXXX-XX-yyyyy" or "joshua-rhel9-agent-XXXXXXXXXXXXXX-XX-yyyyy"
+      # Filter pods by AGENT_NAME first, then by status, then extract job prefix
+      # The number of fields to cut for the job prefix depends on the number of hyphens in AGENT_NAME itself, plus one for the timestamp part.
+      num_hyphen_fields_in_agent_name=$(echo "${AGENT_NAME}" | awk -F'-' '{print NF}')
+      job_prefix_fields=$((num_hyphen_fields_in_agent_name + 1))
+      for job_prefix in $(kubectl get pods -n "${namespace}" --no-headers | { grep -E "^${AGENT_NAME}-[0-9]+(-[0-9]+)?-" || true; } | { grep -E -e "Completed" -e "Error" || true; } | cut -f 1-${job_prefix_fields} -d '-'); do
+          if [ -n "$job_prefix" ]; then
+            echo "=== Deleting Job based on pod status: $job_prefix === (AGENT_NAME: ${AGENT_NAME})"
+            kubectl delete job "$job_prefix" -n "${namespace}" --ignore-not-found=true
+          fi
       done
     fi
 
     # get the current ensembles
     num_ensembles=$(python3 /tools/ensemble_count.py)
-    echo "${num_ensembles} ensembles in the queue"
+    echo "${num_ensembles} ensembles in the queue (global)"
 
-    # get the current jobs
-    num_jobs=$(kubectl get jobs -n "${namespace}" --no-headers | wc -l)
-    echo "${num_jobs} jobs are running"
+    # get the current number of ALL active joshua jobs (rhel9, non-rhel9, etc.)
+    # This counts jobs that are not yet successfully completed or failed.
+    num_all_active_joshua_jobs=$(kubectl get jobs -n "${namespace}" -o jsonpath='{range .items[?(!(@.status.conditions[?(@.type=="Complete"&&@.status=="True")]) && !(@.status.conditions[?(@.type=="Failed"&&@.status=="True")]))]}{.metadata.name}{"\\n"}{end}' 2>/dev/null | grep -Ec "^joshua-(rhel9-)?agent-[0-9]+(-[0-9]+)?$")
+    echo "${num_all_active_joshua_jobs} total active joshua jobs (any type) are running. Global max_jobs: ${max_jobs}."
 
-    # provision more jobs
-    if [ "${num_ensembles}" -gt "${num_jobs}" ]; then
+    new_jobs=0 # Initialize jobs to start this cycle for this scaler
+
+    # Provision more jobs if global ensembles exist and the global max_jobs limit is not reached.
+    if [ "${num_ensembles}" -gt 0 ] && [ "${num_all_active_joshua_jobs}" -lt "${max_jobs}" ]; then
         current_timestamp="$(date +%y%m%d%H%M%S)"
-        new_jobs=$((num_ensembles - num_jobs))
 
-        if [ ${new_jobs} -gt $((max_jobs - num_jobs)) ]; then
-            new_jobs=$((max_jobs - num_jobs))
-        fi
+        # How many slots are available globally before hitting max_jobs
+        slots_available_globally=$((max_jobs - num_all_active_joshua_jobs))
+
+        # Determine how many jobs this scaler instance will attempt to start in this cycle
+        num_to_attempt_this_cycle=${batch_size} # Start with batch_size as the base
 
+        # If MAX_NEW_JOBS is set and is smaller than current num_to_attempt_this_cycle, respect it
         if [ -n "${MAX_NEW_JOBS}" ]; then
-            if [ "${new_jobs}" -gt "${MAX_NEW_JOBS}" ]; then
-                new_jobs=${MAX_NEW_JOBS}
+            if [ "${MAX_NEW_JOBS}" -lt "${num_to_attempt_this_cycle}" ]; then
+                num_to_attempt_this_cycle=${MAX_NEW_JOBS}
             fi
         fi
 
+        # The actual number of new jobs for this scaler is the minimum of what it wants to attempt 
+        # and what's available globally.
+        if [ "${num_to_attempt_this_cycle}" -gt "${slots_available_globally}" ]; then
+            actual_new_jobs_for_this_scaler=${slots_available_globally}
+        else
+            actual_new_jobs_for_this_scaler=${num_to_attempt_this_cycle}
+        fi
+
+        # Ensure we are trying to start a positive number of jobs
+        if [ "${actual_new_jobs_for_this_scaler}" -gt 0 ]; then
+            new_jobs=${actual_new_jobs_for_this_scaler}
+        fi
+
         idx=0
-        echo "Starting ${new_jobs} jobs"
-        while [ $idx -lt ${new_jobs} ]; do
-            if [ -e /tmp/joshua-agent.yaml ]; then
-                rm -f /tmp/joshua-agent.yaml
-            fi
-            i=0
-            while [ $i -lt "${batch_size}" ]; do
-                export JOBNAME_SUFFIX="${current_timestamp}-${idx}"
-                echo "=== Adding $JOBNAME_SUFFIX ==="
-                envsubst </template/joshua-agent.yaml.template >>/tmp/joshua-agent.yaml
-                # add a separator
-                echo "---" >>/tmp/joshua-agent.yaml
-                ((idx++))
-                ((i++))
-                if [ "${idx}" -ge ${new_jobs} ]; then
-                    break
+        if [ "${new_jobs}" -gt 0 ]; then
+            echo "Starting ${new_jobs} jobs"
+            while [ $idx -lt ${new_jobs} ]; do
+                if [ -e /tmp/joshua-agent.yaml ]; then
+                    rm -f /tmp/joshua-agent.yaml
                 fi
+                i=0
+                while [ $i -lt "${batch_size}" ]; do
+                    export JOBNAME_SUFFIX="${current_timestamp}-${idx}"
+                    echo "=== Adding $JOBNAME_SUFFIX ==="
+                    envsubst </template/joshua-agent.yaml.template >>/tmp/joshua-agent.yaml
+                    # add a separator
+                    echo "---" >>/tmp/joshua-agent.yaml
+                    ((idx++))
+                    ((i++))
+                    if [ "${idx}" -ge ${new_jobs} ]; then
+                        break
+                    fi
+                done
+                # /tmp/joshua-agent.yaml contains up to $batch_size entries
+                echo "Starting a batch of ${i} jobs"
+                kubectl apply -f /tmp/joshua-agent.yaml -n "${namespace}"
             done
-            # /tmp/joshua-agent.yaml contains up to $batch_size entries
-            echo "Starting a batch of ${i} jobs"
-            kubectl apply -f /tmp/joshua-agent.yaml -n "${namespace}"
-        done
+        fi
     fi
-    echo "${new_jobs} jobs started"
+    # Standardized log message based on new_jobs calculated for this iteration for this agent type
+    echo "${new_jobs} jobs of type ${AGENT_NAME} were targeted for starting in this iteration."
 
     # check every check_delay seconds
     sleep "${check_delay}"
 done
 exit 0
+