-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrollout_ainodes_asg.sh
More file actions
executable file
·168 lines (147 loc) · 6.21 KB
/
rollout_ainodes_asg.sh
File metadata and controls
executable file
·168 lines (147 loc) · 6.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/bin/bash
# This script cordons ainodes-fix-group nodes and reschedules all pods:
# - It rolls out ainodes in the provided order
# - It rolls out the rtmp stacks one stack at a time
# - It drains everything left on the ASG
# Old nodes will stay unschedulable until being removed by cluster-autoscaler.
# Bash strict mode: exit on error, error on unset variables, fail pipelines.
set -euo pipefail
# ANSI color constants for operator-facing progress messages.
NORMAL="\033[0m"
GREEN="\033[0;32m"
YELLOW="\033[0;33m"
# Ordered list of the ainodes/mongo/backend to reschedule, one CSV record per
# batch, written <name>,<suffix>,<pod manager used>,<should the script wait after>:
#   name   - workload base name (prefixed with "<workflowpool>-" at rollout time)
#   suffix - resource name suffix appended after the name (e.g. "-ainode")
#   kind   - pod manager kind passed to kubectl rollout (here: deployment)
#   wait   - "true" if the script must sleep AINODE_WAIT_TIME after the batch
AINODES_BATCHS=(
  "lbalancer-main,-ainode,deployment,true"
  "dynamicrouter-main,-ainode,deployment,true"
  "drmmanager-main,-ainode,deployment,true"
  "encrypt-main,-ainode,deployment,true"
  "dashmanifestgen-main,-ainode,deployment,false"
  "hlsmanifestgen-main,-ainode,deployment,false"
  "packager-main,-ainode,deployment,true"
  "xaudio-main,-ainode,deployment,false"
  "xsubtitles-main,-ainode,deployment,false"
  "xcode-ska,-ainode,deployment,true"
  "xcode-main,-ainode,deployment,false"
  "xcode-backup,-ainode,deployment,true"
  "segmenter-main,-ainode,deployment,false"
)
# Program argument defaults; overridden by the getopts loop below.
RTMP_STACKS=()
NAMESPACE=""
WORKFLOWPOOL=""
CORDON_ASG="true"
DRAIN_ASG="true"
AINODE_WAIT_TIME="120"
RTMP_WAIT_TIME="30"
AINODE_NODEGROUP_SELECTOR="group=ainodes-fix-group"
# Print usage information (mandatory arguments and available options) on stdout.
# Reads the current option defaults so the help text always matches reality.
help() {
  cat <<EOF
This script reschedules all pods on ainodes-fix-group :
- It rollout ainodes in provided order
- It rollout rtmp stack in provided order
- It drains everything left on the ASG
Usage : $0 -n NAMESPACE -w WORKFLOWPOOL [options]
Mandatory arguments :
-n NAMESPACE Set namespace of the workflowpool.
-w WORKFLOWPOOL : Set the worflowpool name.
Available options :
-h Display this help.
-c Should the script cordon all ainode nodes beforehand, by default ${CORDON_ASG}.
-d Should the script drain all ainode cordonned nodes afterward, by default ${DRAIN_ASG}.
-s Add a rtmp stack to rollout splitted, can be used several times, by default none.
-t Override the default time in seconds between ainodes batch, by default ${AINODE_WAIT_TIME}.
-r Override the default time in seconds between rtmp stacks, by default ${RTMP_WAIT_TIME}.
-l Override Ainode nodegroup label, by default ${AINODE_NODEGROUP_SELECTOR}.
EOF
}
# Parse command-line flags. The leading ":" in the optstring enables silent
# error handling: getopts reports a missing argument as opt=":" and an unknown
# flag as opt="?", with the offending letter in OPTARG in both cases.
while getopts ":n:w:c:d:t:s:r:l:h" opt; do
  case "$opt" in
    h)
      help
      exit 0
      ;;
    n) NAMESPACE="${OPTARG}" ;;
    w) WORKFLOWPOOL="${OPTARG}" ;;
    c) CORDON_ASG="${OPTARG}" ;;
    d) DRAIN_ASG="${OPTARG}" ;;
    s) RTMP_STACKS+=("${OPTARG}") ;;
    t) AINODE_WAIT_TIME="${OPTARG}" ;;
    r) RTMP_WAIT_TIME="${OPTARG}" ;;
    l) AINODE_NODEGROUP_SELECTOR="${OPTARG}" ;;
    :)
      # A flag that requires a value was given without one. Previously this
      # fell through to the generic arm and was mislabelled "Unsupported flag".
      echo "Option -${OPTARG} requires an argument."
      help
      exit 1
      ;;
    *)
      echo "Unsupported flag provided : ${OPTARG}".
      help
      exit 1
      ;;
  esac
done
# Validate mandatory arguments before touching the cluster.
if [ -z "${NAMESPACE}" ]; then
  echo "Namespace was not specified, aborting"
  exit 1
fi
if [ -z "${WORKFLOWPOOL}" ]; then
  echo "WorkflowPool was not specified, aborting"
  exit 1
fi
# Check existence via kubectl's exit code with the namespace quoted. The
# previous form, [ -z "$(kubectl get namespace ${NAMESPACE})" ], word-split an
# unquoted ${NAMESPACE} and inferred failure from empty stdout rather than
# from the API result, while leaking kubectl's error output to the terminal.
if ! kubectl get namespace "${NAMESPACE}" >/dev/null 2>&1; then
  echo "Namespace ${NAMESPACE} does not exist, exiting."
  exit 1
fi
# Recap every action this run will perform, then require explicit confirmation
# from the operator before any cluster mutation happens.
echo -e "${YELLOW}This script will take the following actions :${NORMAL}"
if [[ "${CORDON_ASG}" == "true" ]]; then echo "* nodes matching \"${AINODE_NODEGROUP_SELECTOR}\" will be cordonned"; fi
echo "* ainodes of workflow \"${WORKFLOWPOOL}\" in \"${NAMESPACE}\" will be rollouted in order waiting ${AINODE_WAIT_TIME}s between batchs"
echo "* rtmp stacks [${RTMP_STACKS[*]+${RTMP_STACKS[*]}}] in \"${NAMESPACE}\" will be rollouted in order waiting ${RTMP_WAIT_TIME}s between batchs"
if [[ "${DRAIN_ASG}" == "true" ]]; then echo "* cordonned nodes matching \"${AINODE_NODEGROUP_SELECTOR}\" will be drained"; fi
# Abort unless the operator answers exactly "y".
read -rp "Continue? [y/n] " answer
case "${answer}" in
  y) ;;
  *)
    echo "Did not receive [y], exiting."
    exit 1
    ;;
esac
# Optionally cordon every ainode node up front so no rescheduled pod can land
# back on a node that is about to be recycled.
if [[ "${CORDON_ASG}" == "true" ]]; then
  echo -e "${GREEN}Cordon all ainode nodes ${NORMAL}"
  kubectl cordon --selector "${AINODE_NODEGROUP_SELECTOR}"
fi
# Reschedule workflow deployments in order. Each AINODES_BATCHS record is
# "<name>,<suffix>,<kind>,<should_wait>" (see its declaration at the top).
echo -e "${GREEN}Rescheduling all ainodes...${NORMAL}"
for deploy in "${AINODES_BATCHS[@]}"; do
  # Split the CSV record with one builtin read instead of four cut(1)
  # subshells per iteration; also avoids shadowing the `wait` builtin and
  # names field 2 for what it is (a resource-name suffix, not a type).
  IFS=, read -r name suffix kind should_wait <<<"${deploy}"
  echo -e "${GREEN}Rescheduling ${kind} ${name} ${NORMAL}"
  kubectl rollout restart "${kind}/${WORKFLOWPOOL}-${name}${suffix}" -n "${NAMESPACE}"
  kubectl rollout status "${kind}/${WORKFLOWPOOL}-${name}${suffix}" -n "${NAMESPACE}"
  if [[ "${should_wait}" == "true" ]]; then
    echo -e "${YELLOW}Ainode needs to populate cache, waiting ${AINODE_WAIT_TIME} seconds ${NORMAL}"
    sleep "${AINODE_WAIT_TIME}"
  fi
done
# Reschedule rtmp deployments stack by stack.
# This is a naive method, which should not affect HA rtmp streams, but has
# 43 + 12s of black due to jumping from one loadbalancer to the next.
# It has been chosen over doing them separately since, until preStop is
# implemented, that would only increase the global black duration (35 + 45s).
# NOTE: the ${RTMP_STACKS[@]+...} expansion guards against an unset/empty
# array under `set -u` (bash < 4.4 aborts on a bare "${RTMP_STACKS[@]}");
# it expands to nothing when no -s flag was given, skipping the loop.
for stack in ${RTMP_STACKS[@]+"${RTMP_STACKS[@]}"}; do
  echo -e "${GREEN}Rescheduling all rtmp loadbalancers and handlers related to stack ${stack}...${NORMAL}"
  kubectl -n "${NAMESPACE}" rollout restart deployment \
    --selector "app.kubernetes.io/name in (rtmp-loadbalancer,rtmp-handler),app.kubernetes.io/instance=${stack}"
  kubectl -n "${NAMESPACE}" rollout status deployment \
    --selector "app.kubernetes.io/name in (rtmp-loadbalancer,rtmp-handler),app.kubernetes.io/instance=${stack}"
  # Let the restarted stack settle before touching the next one.
  sleep "${RTMP_WAIT_TIME}"
done
# Drain all currently cordonned nodes.
# The pods left on the ASG should be drainable in parallel : mongodb (has PDB),
# traefik (has PDB), backends (non-critical).
if [[ "${DRAIN_ASG}" == "true" ]]; then
  echo -e "${GREEN}Finally drain all ainodes already cordonned nodes...${NORMAL}"
  # jsonpath emits all matching node names space-separated on a single line,
  # which read -a splits into one array element per node.
  read -ra nodes_to_rollout < <(
    kubectl get nodes \
      --selector "${AINODE_NODEGROUP_SELECTOR}" \
      --field-selector spec.unschedulable=true \
      -o jsonpath="{.items[*]['metadata.name']}{'\n'}"
  )
  # Guard the empty case: with no matching node, expanding the empty array
  # aborts under `set -u` on bash < 4.4, and on newer bash `kubectl drain`
  # would be invoked without any node and fail.
  if [[ "${#nodes_to_rollout[@]}" -eq 0 ]]; then
    echo "No cordonned node matching \"${AINODE_NODEGROUP_SELECTOR}\" found, nothing to drain."
  else
    echo -e "${YELLOW}The following nodes will be drained : ${nodes_to_rollout[*]}${NORMAL} "
    kubectl drain --ignore-daemonsets --delete-emptydir-data "${nodes_to_rollout[@]}"
  fi
fi