8 changes: 8 additions & 0 deletions jenkins/jobs/rotate-nomad-pool.yaml
@@ -35,6 +35,14 @@
       name: MEMORY_IN_GBS
       description: "Memory to use"
       trim: true
+  - string:
+      name: ROLE
+      description: "Role for pool, defaults to nomad-pool"
+      trim: true
+  - string:
+      name: POOL_TYPE
+      description: "Pool type, defaults to general"
+      trim: true
   - string:
       name: INFRA_CONFIGURATION_REPO
       default: [email protected]:jitsi/infra-configuration.git
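The ROLE and POOL_TYPE parameter descriptions state their defaults, but the defaulting itself is not visible in the hunks shown here. A minimal sketch of what the script presumably does, in the same `[ -z ... ]` style it already uses (hypothetical, not part of this diff):

```bash
# Hypothetical defaulting to match the parameter descriptions above;
# the real assignments live outside the displayed hunks.
[ -z "$ROLE" ] && ROLE="nomad-pool"
[ -z "$POOL_TYPE" ] && POOL_TYPE="general"
```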
47 changes: 34 additions & 13 deletions scripts/rotate-nomad-pool-oracle.sh
@@ -20,6 +20,7 @@ fi
 # if no load balancer is available to detect bootup health then
 # wait a fixed period between instances when rotating
 [ -z "$STARTUP_GRACE_PERIOD_SECONDS" ] && STARTUP_GRACE_PERIOD_SECONDS=300 # 5 min
+[ -z "$STARTUP_TIMEOUT_PERIOD_SECONDS" ] && STARTUP_TIMEOUT_PERIOD_SECONDS=1200 # 20 min
 
 if [ -z "$ORACLE_REGION" ]; then
   echo "No ORACLE_REGION found. Exiting..."
@@ -74,16 +75,18 @@ else
   done
 fi
 
-# first apply changes to instance configuration, etc
-$LOCAL_PATH/../terraform/nomad-pool/create-nomad-pool-stack.sh $SSH_USER
+if [[ "$POOL_TYPE" == "general" ]]; then
+  # first apply changes to instance configuration, etc
+  $LOCAL_PATH/../terraform/nomad-pool/create-nomad-pool-stack.sh $SSH_USER
 
-if [ $? -gt 0 ]; then
-  echo -e "\n## Nomad pool configuration update failed, exiting"
-  exit 5
+  if [ $? -gt 0 ]; then
+    echo -e "\n## Nomad pool configuration update failed, exiting"
+    exit 5
+  fi
 fi
 
 
-ENVIRONMENT=$ENVIRONMENT ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py inventory
+ENVIRONMENT=$ENVIRONMENT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py inventory
 
 # check if the load balancer is healthy before proceeding
 if [ ! -z "$LOAD_BALANCER_ID" ]; then
@@ -93,11 +96,15 @@ else
     echo "State of existing load balancer for nomad instance pool is not ok; exiting before scale up."
     exit 5
   fi
+else
+  # pull the nomad client list to use when checking for new nodes later
+  NOMAD_READY_CLIENTS="$($LOCAL_PATH/nomad.sh node status -json -filter "NodeClass==$POOL_TYPE" | jq ".|map(select(.Datacenter==\"${ENVIRONMENT}-${ORACLE_REGION}\" and .Status==\"ready\"))")"
+  NOMAD_READY_CLIENTS_COUNT=$(echo $NOMAD_READY_CLIENTS | jq -r ".|length")
 fi
 
 # next scale up by 2X
 echo -e "\n## rotate-nomad-pool-oracle: double the size of nomad pool"
-ENVIRONMENT=$ENVIRONMENT ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py double --wait
+ENVIRONMENT=$ENVIRONMENT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py double --wait
 
 # wait for load balancer to see new instances go healthy
 if [ ! -z "$LOAD_BALANCER_ID" ]; then
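The ready-client snapshot taken above drives the scale-up check later in the diff. A small self-contained illustration of the same jq selection, run against hypothetical node JSON in the shape `nomad node status -json` returns (the environment and region values are made up):

```bash
# Hypothetical two-node listing: only the "ready" node in the matching
# datacenter survives the filter, so the count comes out as 1.
echo '[
  {"Datacenter": "example-env-us-phoenix-1", "NodeClass": "general", "Status": "ready"},
  {"Datacenter": "example-env-us-phoenix-1", "NodeClass": "general", "Status": "initializing"}
]' | jq '. | map(select(.Datacenter == "example-env-us-phoenix-1" and .Status == "ready")) | length'
# prints: 1
```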
@@ -126,13 +133,27 @@
   fi
 else
   # No load balancer to detect healthy state, so wait for fixed duration before continuing
-  if [[ $i -lt $((INSTANCE_COUNT-1)) ]]; then
-    echo "Waiting for $STARTUP_GRACE_PERIOD_SECONDS seconds before rotating next instance"
-    sleep $STARTUP_GRACE_PERIOD_SECONDS
-  fi
+  echo "Waiting for $STARTUP_GRACE_PERIOD_SECONDS seconds before beginning check for nodes in nomad"
+  sleep $STARTUP_GRACE_PERIOD_SECONDS
+  NOMAD_NEW_CLIENTS="$($LOCAL_PATH/nomad.sh node status -json -filter "NodeClass==$POOL_TYPE" | jq ".|map(select(.Datacenter==\"${ENVIRONMENT}-${ORACLE_REGION}\" and .Status==\"ready\"))")"
+  NOMAD_NEW_CLIENTS_COUNT=$(echo $NOMAD_NEW_CLIENTS | jq -r ".|length")
+  SLEEP_TIME=$((STARTUP_TIMEOUT_PERIOD_SECONDS - STARTUP_GRACE_PERIOD_SECONDS))
+  WAIT_TOTAL=0
+  while [[ "$NOMAD_NEW_CLIENTS_COUNT" -lt "$((NOMAD_READY_CLIENTS_COUNT*2))" ]]; do
+    if [ $WAIT_TOTAL -gt $SLEEP_TIME ]; then
+      echo "Exceeded max waiting time of $SLEEP_TIME seconds for the nomad client state to reach expected count of $((NOMAD_READY_CLIENTS_COUNT*2)). Current count is $NOMAD_NEW_CLIENTS_COUNT. Something is wrong, exiting..."
+      exit 224
+    fi
+
+    echo "Waiting for the nomad client state to reach expected count of $((NOMAD_READY_CLIENTS_COUNT*2)). Current count is $NOMAD_NEW_CLIENTS_COUNT."
+    sleep $WAIT_INTERVAL_SECONDS
+    WAIT_TOTAL=$((WAIT_TOTAL + WAIT_INTERVAL_SECONDS))
+    NOMAD_NEW_CLIENTS="$($LOCAL_PATH/nomad.sh node status -json -filter "NodeClass==$POOL_TYPE" | jq ".|map(select(.Datacenter==\"${ENVIRONMENT}-${ORACLE_REGION}\" and .Status==\"ready\"))")"
+    NOMAD_NEW_CLIENTS_COUNT=$(echo $NOMAD_NEW_CLIENTS | jq -r ".|length")
+  done
 fi
 
-DETACHABLE_IPS=$(ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=2 ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --onlyip)
+DETACHABLE_IPS=$(ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=$INSTANCE_COUNT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --onlyip)
 if [ -z "$DETACHABLE_IPS" ]; then
   echo "## ERROR: No IPs found to detach, something went wrong..."
   exit 226
@@ -159,6 +180,6 @@ else
 
 # scale down the old instances
 echo -e "\n## rotate-nomad-pool-oracle: halve the size of nomad instance pool"
-ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=2 ROLE=nomad-pool INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --wait
+ENVIRONMENT=$ENVIRONMENT MINIMUM_POOL_SIZE=$INSTANCE_COUNT ROLE=$ROLE INSTANCE_POOL_ID=$INSTANCE_POOL_ID ORACLE_REGIONS=$ORACLE_REGION $LOCAL_PATH/pool.py halve --wait
 
 fi
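Taken together, the changes let the same rotation flow cover pools other than the default nomad-pool/general one: the terraform stack refresh now runs only when POOL_TYPE is general, ROLE is threaded through every pool.py call, and when no load balancer is configured the script waits for the ready Nomad client count to double instead of sleeping a fixed period per instance. A hypothetical invocation for such a pool (all values are illustrative; real role and pool-type names depend on the environment):

```bash
# Hypothetical run; ROLE and POOL_TYPE values are made up for illustration.
# With POOL_TYPE != "general" the terraform stack update is skipped, and
# with no LOAD_BALANCER_ID the Nomad ready-client count gates the rotation.
ENVIRONMENT=example-env ORACLE_REGION=us-phoenix-1 \
  ROLE=example-pool POOL_TYPE=example \
  scripts/rotate-nomad-pool-oracle.sh
```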