8 changes: 8 additions & 0 deletions jenkins/jobs/rotate-nomad-pool.yaml
@@ -35,6 +35,14 @@
       name: MEMORY_IN_GBS
       description: "Memory to use"
       trim: true
+  - string:
+      name: ROLE
+      description: "Role for pool, defaults to nomad-pool"
+      trim: true
+  - string:
+      name: POOL_TYPE
+      description: "Pool type, defaults to general"
+      trim: true
   - string:
       name: INFRA_CONFIGURATION_REPO
       default: [email protected]:jitsi/infra-configuration.git
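The ROLE and POOL_TYPE parameter descriptions state their defaults, but the defaulting itself is not visible in the hunks shown here. A minimal sketch of what the script presumably does, in the same `[ -z ... ]` style it already uses (hypothetical, not part of this diff):

```bash
# Hypothetical defaulting to match the parameter descriptions above;
# the real assignments live outside the displayed hunks.
[ -z "$ROLE" ] && ROLE="nomad-pool"
[ -z "$POOL_TYPE" ] && POOL_TYPE="general"
```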
47 changes: 34 additions & 13 deletions scripts/rotate-nomad-pool-oracle.sh
@@ -20,6 +20,7 @@ fi
 # if no load balancer is available to detect bootup health then
 # wait a fixed period between instances when rotating
 [ -z "$STARTUP_GRACE_PERIOD_SECONDS" ] && STARTUP_GRACE_PERIOD_SECONDS=300 # 5 min
+[ -z "$STARTUP_TIMEOUT_PERIOD_SECONDS" ] && STARTUP_TIMEOUT_PERIOD_SECONDS=1200 # 20 min
 
 if [ -z "$ORACLE_REGION" ]; then
   echo "No ORACLE_REGION found. Exiting..."
@@ -74,16 +75,18 @@ else
   done
 fi
 
-# first apply changes to instance configuration, etc
-$LOCAL_PATH/../terraform/nomad-pool/create-nomad-pool-stack.sh $SSH_USER
+if [[ "$POOL_TYPE" == "general" ]]; then
+  # first apply changes to instance configuration, etc
+  $LOCAL_PATH/../terraform/nomad-pool/create-nomad-pool-stack.sh $SSH_USER
 
-if [ $? -gt 0 ]; then
-  echo -e "\n## Nomad pool configuration update failed, exiting"
-  exit 5
+  if [ $? -gt 0 ]; then
+    echo -e "\n## Nomad pool configuration update failed, exiting"
+    exit 5
+  fi
 fi
 
 
-ENVIRONMENT=$ENVIRONMENT ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py inventory
+ENVIRONMENT=$ENVIRONMENT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py inventory
 
 # check if the load balancer is healthy before proceeding
 if [ ! -z "$LOAD_BALANCER_ID" ]; then
@@ -93,11 +96,15 @@ else
     echo "State of existing load balancer for nomad instance pool is not ok; exiting before scale up."
     exit 5
   fi
+else
+  # pull the nomad client list to use when checking for new nodes later
+  NOMAD_READY_CLIENTS="$($LOCAL_PATH/nomad.sh node status -json -filter "NodeClass==$POOL_TYPE" | jq ".|map(select(.Datacenter==\"${ENVIRONMENT}-${ORACLE_REGION}\" and .Status==\"ready\"))")"
+  NOMAD_READY_CLIENTS_COUNT=$(echo $NOMAD_READY_CLIENTS | jq -r ".|length")
 fi
 
 # next scale up by 2X
 echo -e "\n## rotate-nomad-pool-oracle: double the size of nomad pool"
-ENVIRONMENT=$ENVIRONMENT ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py double --wait
+ENVIRONMENT=$ENVIRONMENT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py double --wait
 
 # wait for load balancer to see new instances go healthy
 if [ ! -z "$LOAD_BALANCER_ID" ]; then
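The ready-client snapshot taken above drives the scale-up check later in the diff. A small self-contained illustration of the same jq selection, run against hypothetical node JSON in the shape `nomad node status -json` returns (the environment and region values are made up):

```bash
# Hypothetical two-node listing: only the "ready" node in the matching
# datacenter survives the filter, so the count comes out as 1.
echo '[
  {"Datacenter": "example-env-us-phoenix-1", "NodeClass": "general", "Status": "ready"},
  {"Datacenter": "example-env-us-phoenix-1", "NodeClass": "general", "Status": "initializing"}
]' | jq '. | map(select(.Datacenter == "example-env-us-phoenix-1" and .Status == "ready")) | length'
# prints: 1
```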
@@ -126,13 +133,27 @@
   fi
 else
   # No load balancer to detect healthy state, so wait for fixed duration before continuing
-  if [[ $i -lt $((INSTANCE_COUNT-1)) ]]; then
-    echo "Waiting for $STARTUP_GRACE_PERIOD_SECONDS seconds before rotating next instance"
-    sleep $STARTUP_GRACE_PERIOD_SECONDS
-  fi
+  echo "Waiting for $STARTUP_GRACE_PERIOD_SECONDS seconds before beginning check for nodes in nomad"
+  sleep $STARTUP_GRACE_PERIOD_SECONDS
+  NOMAD_NEW_CLIENTS="$($LOCAL_PATH/nomad.sh node status -json -filter "NodeClass==$POOL_TYPE" | jq ".|map(select(.Datacenter==\"${ENVIRONMENT}-${ORACLE_REGION}\" and .Status==\"ready\"))")"
+  NOMAD_NEW_CLIENTS_COUNT=$(echo $NOMAD_NEW_CLIENTS | jq -r ".|length")
+  SLEEP_TIME=$((STARTUP_TIMEOUT_PERIOD_SECONDS - STARTUP_GRACE_PERIOD_SECONDS))
+  WAIT_TOTAL=0
+  while [[ "$NOMAD_NEW_CLIENTS_COUNT" -lt "$((NOMAD_READY_CLIENTS_COUNT*2))" ]]; do
+    if [ $WAIT_TOTAL -gt $SLEEP_TIME ]; then
+      echo "Exceeded max waiting time of $SLEEP_TIME seconds for the nomad client state to reach expected count of $((NOMAD_READY_CLIENTS_COUNT*2)). Current count is $NOMAD_NEW_CLIENTS_COUNT. Something is wrong, exiting..."
+      exit 224
+    fi
+
+    echo "Waiting for the nomad client state to reach expected count of $((NOMAD_READY_CLIENTS_COUNT*2)). Current count is $NOMAD_NEW_CLIENTS_COUNT."
+    sleep $WAIT_INTERVAL_SECONDS
+    WAIT_TOTAL=$((WAIT_TOTAL + WAIT_INTERVAL_SECONDS))
+    NOMAD_NEW_CLIENTS="$($LOCAL_PATH/nomad.sh node status -json -filter "NodeClass==$POOL_TYPE" | jq ".|map(select(.Datacenter==\"${ENVIRONMENT}-${ORACLE_REGION}\" and .Status==\"ready\"))")"
+    NOMAD_NEW_CLIENTS_COUNT=$(echo $NOMAD_NEW_CLIENTS | jq -r ".|length")
+  done
 fi
 
-DETACHABLE_IPS=$(ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=2 ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --onlyip)
+DETACHABLE_IPS=$(ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=$INSTANCE_COUNT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --onlyip)
 if [ -z "$DETACHABLE_IPS" ]; then
   echo "## ERROR: No IPs found to detach, something went wrong..."
   exit 226
@@ -159,6 +180,6 @@ else
 
 # scale down the old instances
 echo -e "\n## rotate-nomad-pool-oracle: halve the size of nomad instance pool"
-ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=2 ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --wait
+ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=$INSTANCE_COUNT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --wait
 
 fi
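Taken together, the changes let the same rotation flow cover pools other than the default nomad-pool/general one: the terraform stack refresh now runs only when POOL_TYPE is general, ROLE is threaded through every pool.py call, and when no load balancer is configured the script waits for the ready Nomad client count to double instead of sleeping a fixed period per instance. A hypothetical invocation for such a pool (all values are illustrative; real role and pool-type names depend on the environment):

```bash
# Hypothetical run; ROLE and POOL_TYPE values are made up for illustration.
# With POOL_TYPE != "general" the terraform stack update is skipped, and
# with no LOAD_BALANCER_ID the Nomad ready-client count gates the rotation.
ENVIRONMENT=example-env ORACLE_REGION=us-phoenix-1 \
  ROLE=example-pool POOL_TYPE=example \
  scripts/rotate-nomad-pool-oracle.sh
```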