Skip to content

Commit ec1e726

Browse files
bczomaPhilippeKhalife
authored and committed
Robustness improvements for rolling upgrade. Also fixes issue #33 (#46)
* Additional checks for readiness, increase config-sync-check timeout, enable local debug.log
1 parent 1b35825 commit ec1e726

File tree

2 files changed

+69
-11
lines changed

2 files changed

+69
-11
lines changed

.travis.yml

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ script:
4343
- kubectl get statefulset,svc,pods,pvc,pv --show-labels
4444
- echo "Waiting for cluster to become active"
4545
- "travis_wait 30 sleep 1800 &"
46+
- sleep 40; kubectl describe nodes # This shall show resource availability
4647
- until kubectl get pods --show-labels | grep solace-0 | grep -m 1 -E 'active=true'; do sleep 10; done
4748
- until kubectl get pods --show-labels | grep solace-1 | grep -m 1 -E '1/1'; do sleep 10; done
4849
- until kubectl get pods --show-labels | grep solace-2 | grep -m 1 -E '1/1'; do sleep 10; done

solace/templates/solaceConfigMap.yaml

+68-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
---
21
apiVersion: v1
32
kind: ConfigMap
43
metadata:
@@ -17,7 +16,7 @@ data:
1716
export service_webtransport_port='60080'
1817
export service_webtransport_tlsport='60443'
1918
export service_semp_tlsport='60943'
20-
export logging_debug_output=stdout
19+
export logging_debug_output=all
2120
{{- if eq .Values.solace.size "dev100" }}
2221
export system_scaling_maxconnectioncount="100"
2322
{{- else if eq .Values.solace.size "prod100" }}
@@ -83,7 +82,7 @@ data:
8382
IFS='-' read -ra host_array <<< $(hostname)
8483
node_ordinal=${host_array[-1]}
8584
password=`cat {{ .Values.filepaths.secrets }}/username_admin_password`
86-
loop_guard=30
85+
loop_guard=60
8786
pause=10
8887
count=0
8988
resync_step=""
@@ -93,7 +92,7 @@ data:
9392
echo "`date` INFO: ${APP}-Determine if Active or Backup role to know which Virtual router to check"
9493
while [ ${count} -lt ${loop_guard} ]; do
9594
role_results=`{{ .Values.filepaths.configmap }}/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
96-
-q "<rpc semp-version='soltr/8_5VMR'><show><redundancy><detail/></redundancy></show></rpc>" \
95+
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
9796
-v "/rpc-reply/rpc/show/redundancy/active-standby-role[text()]"`
9897
case "`echo ${role_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`" in
9998
"Primary")
@@ -118,7 +117,7 @@ data:
118117
echo "`date` INFO: ${APP}-Active or Backup role is ${role}"
119118
while [ ${count} -lt ${loop_guard} ]; do
120119
online_results=`{{ .Values.filepaths.configmap }}/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
121-
-q "<rpc semp-version='soltr/8_5VMR'><show><redundancy><detail/></redundancy></show></rpc>" \
120+
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
122121
-v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/activity[text()]"`
123122
local_activity=`echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
124123
echo "`date` INFO: ${APP}-Local activity state is: ${local_activity}"
@@ -155,7 +154,7 @@ data:
155154
echo "`date` INFO: ${APP}-Wait for mate to be 'Standby'"
156155
while [ ${count} -lt ${loop_guard} ]; do
157156
online_results=`{{ .Values.filepaths.configmap }}/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
158-
-q "<rpc semp-version='soltr/8_5VMR'><show><redundancy><detail/></redundancy></show></rpc>" \
157+
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
159158
-v "/rpc-reply/rpc/show/redundancy/virtual-routers/${role}/status/detail/priority-reported-by-mate/summary[text()]"`
160159
mate_activity=`echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
161160
echo "`date` INFO: ${APP}-Mate activity state is: ${mate_activity}"
@@ -177,9 +176,9 @@ data:
177176
fi # if assert-master
178177
# If all that is needed is to resync master, can issue the command and exit.
179178
{{ .Values.filepaths.configmap }}/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
180-
-q "<rpc semp-version='soltr/8_5VMR'><admin><config-sync><${resync_step}><router/></${resync_step}></config-sync></admin></rpc>"
179+
-q "<rpc><admin><config-sync><${resync_step}><router/></${resync_step}></config-sync></admin></rpc>"
181180
{{ .Values.filepaths.configmap }}/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
182-
-q "<rpc semp-version='soltr/8_5VMR'><admin><config-sync><${resync_step}><vpn-name>default</vpn-name></${resync_step}></config-sync></admin></rpc>"
181+
-q "<rpc><admin><config-sync><${resync_step}><vpn-name>default</vpn-name></${resync_step}></config-sync></admin></rpc>"
183182
echo "`date` INFO: ${APP}-Solace VMR bringup complete"
184183
fi # if not monitor
185184
{{- end }}
@@ -220,13 +219,71 @@ data:
220219
node_ordinal=${host_array[-1]}
221220
echo "`date` INFO: ${APP}-node ordinal: ${node_ordinal}"
222221
password=`cat {{ .Values.filepaths.secrets }}/username_admin_password`
222+
223+
# For upgrade purposes, ensure redundancy is up only when the pod is started
224+
redundacycheck_file=/tmp/redundacycheck
225+
if [ ! -f ${redundacycheck_file} ]; then
226+
# First check all nodes are online
227+
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
228+
-q "<rpc><show><redundancy><group/></redundancy></show></rpc>" \
229+
-c "/rpc-reply/rpc/show/redundancy/group-node/status[text() = \"Online\"]"`
230+
# Extract the count of "Online" group nodes from the reply of the group query stored in ${results} above.
nr_node_results=`echo ${results} | xmllint -xpath "string(returnInfo/countSearchResult)" -`
231+
if [ $nr_node_results -ne 3 ]; then
232+
echo "`date` INFO: ${APP}-Not all nodes are online. Query results: ${nr_node_results}"
233+
exit 1
234+
fi
235+
# Then for each node determine the ip address and check redundancy. Note: id starts here from 1 and not 0.
236+
for id in 1 2 3; do
237+
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
238+
-q "<rpc><show><redundancy><group/></redundancy></show></rpc>" \
239+
-v "//ip-address[$id]"`
240+
node_ip_address=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
241+
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://$node_ip_address:8080/SEMP \
242+
-q "<rpc><show><redundancy/></show></rpc>" \
243+
-v "/rpc-reply/rpc/show/redundancy/redundancy-status"`
244+
redundancystatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
245+
if [ "${redundancystatus_results}" != "Up" ]; then
246+
echo "`date` INFO: ${APP}-Redundancy state is not yet up. Query results: ${redundancystatus_results}"
247+
exit 1
248+
fi
249+
done
250+
# Additionally check config-sync status for non-monitoring nodes
251+
if [ "${node_ordinal}" != "2" ]; then
252+
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
253+
-q "<rpc><show><config-sync></config-sync></show></rpc>" \
254+
-v "/rpc-reply/rpc/show/config-sync/status/oper-status"`
255+
confsyncstatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
256+
if [ "${confsyncstatus_results}" != "Up" ]; then
257+
echo "`date` INFO: ${APP}-Config-sync state is not yet up. Query results: ${confsyncstatus_results}"
258+
exit 1
259+
fi
260+
fi
261+
# Then for each node check that they report 3 Consul voters.
262+
for id in 1 2 3; do
263+
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
264+
-q "<rpc><show><redundancy><group/></redundancy></show></rpc>" \
265+
-v "//ip-address[$id]"`
266+
node_ip_address=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
267+
nr_voter_results=`curl --unix-socket /var/run/consul -s http://$node_ip_address:8500/v1/operator/raft/configuration | python -m json.tool | grep Voter | grep true | wc -l`
268+
if [ $nr_voter_results -ne 3 ]; then
269+
echo "`date` INFO: ${APP}-Still waiting for all 3 Consul voters to be present for node $node_ip_address. Query results: ${nr_voter_results}"
270+
exit 1
271+
fi
272+
done
273+
echo "Creating redundacycheck_file"
274+
echo "true" > ${redundacycheck_file}
275+
# wait at first startup for stability
276+
#sleep 20
277+
fi
278+
279+
223280
if [ "${node_ordinal}" = "2" ]; then
224281
echo "`date` INFO: ${APP}-Monitor node ready check"
225282
# Note that when dealing with Monitor, only need to be concerned with the readiness response.
226283
# active label will always be "false"
227284
echo "`date` INFO: ${APP}-For monitor node just check for 3 online nodes in group"
228285
role_results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
229-
-q "<rpc semp-version='soltr/8_5VMR'><show><redundancy><group/></redundancy></show></rpc>" \
286+
-q "<rpc><show><redundancy><group/></redundancy></show></rpc>" \
230287
-c "/rpc-reply/rpc/show/redundancy/group-node/status[text() = \"Online\"]"`
231288
if [ `echo ${role_results} | xmllint -xpath "string(returnInfo/countSearchResult)" -` -eq 3 ]; then
232289
echo "`date` INFO: ${APP}-Monitor node is redundancy ready"
@@ -270,7 +327,7 @@ data:
270327
;;
271328
esac
272329
online_results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080/SEMP \
273-
-q "<rpc semp-version='soltr/8_5VMR'><show><redundancy><detail/></redundancy></show></rpc>" \
330+
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
274331
-v "/rpc-reply/rpc/show/redundancy/virtual-routers/${config_role}/status/activity[text()]"`
275332
local_activity=`echo ${online_results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
276333
echo "`date` INFO: ${APP}-Local activity state is: ${local_activity}"
@@ -385,4 +442,4 @@ data:
385442
echo -e "`date` INFO: ${APP}-${script_name}: \n\t count search: $count_search \n\t count_line: ${count_line} \n\t count_string: ${count_string} \n\t count_result: ${count_result}" >&2
386443
echo "<returnInfo><errorInfo></errorInfo><countSearchResult>${count_result}</countSearchResult></returnInfo>"
387444
exit 0
388-
fi
445+
fi

0 commit comments

Comments
 (0)