Skip to content

Commit 3f460d2

Browse files
authored
Startup and readiness improvements and fix for monitor verbose readiness logging (#133)
1 parent c989700 commit 3f460d2

File tree

2 files changed

+61
-82
lines changed

2 files changed

+61
-82
lines changed

pubsubplus/Chart.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
apiVersion: v2
22
description: Deploy Solace PubSub+ Event Broker Singleton or HA redundancy group onto a Kubernetes Cluster
33
name: pubsubplus
4-
version: 3.3.1
5-
icon: https://solaceproducts.github.io/pubsubplus-kubernetes-quickstart/images/PubSubPlus.png
4+
version: 3.3.2
5+
icon: https://solaceproducts.github.io/pubsubplus-kubernetes-helm-quickstart/images/PubSubPlus.png
66
kubeVersion: '>= 1.10.0-0'
77
maintainers:
88
- name: Solace Community Forum

pubsubplus/templates/solaceConfigMap.yaml

+59-80
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,9 @@ data:
3737
cat /mnt/disks/certs/server/{{.Values.tls.certFilename | default "tls.key"}} /mnt/disks/certs/server/{{.Values.tls.certKeyFilename | default "tls.crt"}} > /dev/shm/server.cert
3838
export tls_servercertificate_filepath="/dev/shm/server.cert"
3939
{{- end }}
40+
# Deal with the fact we cannot accept "-" in router names
41+
export routername=$(echo $(hostname) | sed 's/-//g')
4042
{{- if .Values.solace.redundancy }}
41-
# [TODO] KBARR not using correct method of finding ordinal until we bump min Kubernetes release above 1.8.1
42-
# https://github.com/kubernetes/kubernetes/issues/40651
43-
# node_ordinal=$(STATEFULSET_ORDINAL)
4443
IFS='-' read -ra host_array <<< $(hostname)
4544
node_ordinal=${host_array[-1]}
4645
if [[ ! -z `echo $STATEFULSET_NAMESPACE` ]]; then
@@ -49,9 +48,7 @@ data:
4948
namespace=default
5049
fi
5150
service={{ template "solace.fullname" . }}
52-
# Deal with the fact we cannot accept "-" in routre names
5351
service_name=$(echo ${service} | sed 's/-//g')
54-
export routername=$(echo $(hostname) | sed 's/-//g')
5552
export redundancy_enable=yes
5653
export configsync_enable=yes
5754
export redundancy_authentication_presharedkey_key=`cat /mnt/disks/secrets/username_admin_password | awk '{x=$0;for(i=length;i<51;i++)x=x "0";}END{print x}' | base64` # Right-pad with 0s to 50 length
@@ -92,6 +89,7 @@ data:
9289
loop_guard=60
9390
pause=10
9491
count=0
92+
# Wait for Solace Management API
9593
while [ ${count} -lt ${loop_guard} ]; do
9694
if /mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 -t ; then
9795
break
@@ -131,6 +129,7 @@ data:
131129
resync_step_required=""
132130
role=""
133131
count=0
132+
# Determine node's primary or backup role
134133
while [ ${count} -lt ${loop_guard} ]; do
135134
role_results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
136135
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
@@ -147,16 +146,16 @@ data:
147146
;;
148147
esac
149148
((count++))
150-
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's active-standby role"
149+
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, got ${role_results} for this node's primary or backup role"
151150
sleep ${pause}
152151
done
153152
if [ ${count} -eq ${loop_guard} ]; then
154-
echo "`date` ERROR: ${APP}-Could not determine this node's active-standby role" >&2
153+
echo "`date` ERROR: ${APP}-Could not determine this node's primary or backup role" >&2
155154
exit 1
156155
fi
157-
# Determine local activity
156+
echo "`date` INFO: ${APP}-Management API is up, determined that this node's role is: ${role}"
157+
# Determine activity (local or mate active)
158158
count=0
159-
echo "`date` INFO: ${APP}-Management API is up, determined that this node's active-standby role is: ${role}"
160159
while [ ${count} -lt ${loop_guard} ]; do
161160
online_results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
162161
-q "<rpc><show><redundancy><detail/></redundancy></show></rpc>" \
@@ -172,7 +171,7 @@ data:
172171
echo "`date` INFO: ${APP}-Broker initial startup detected. This node will assert config-sync configuration over its mate"
173172
resync_step_required="true"
174173
else
175-
echo "`date` WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Normally expected nodes are Mate Active after restart"
174+
echo "`date` WARN: ${APP}-Unexpected state: this is not an initial startup of the broker and this node reports Local Active. Possibly a redeploy?"
176175
fi
177176
break
178177
;;
@@ -182,15 +181,16 @@ data:
182181
;;
183182
esac
184183
((count++))
185-
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, Local activity state is: ${local_activity}"
184+
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, node activity state is: ${local_activity}"
186185
sleep ${pause}
187186
done
188187
if [ ${count} -eq ${loop_guard} ]; then
189-
echo "`date` ERROR: ${APP}-Local activity state never become Local Active or Mate Active" >&2
188+
echo "`date` ERROR: ${APP}-Node activity state never become Local Active or Mate Active" >&2
190189
exit 1
191190
fi
192-
# If we need to assert leader, then we need to wait for mate to reconcile
191+
# If we need to assert leader, then first wait for mate to report Standby state
193192
if [ "${resync_step_required}" = "true" ]; then
193+
# This branch is AD-active only
194194
count=0
195195
echo "`date` INFO: ${APP}-Waiting for mate activity state to be 'Standby'"
196196
while [ ${count} -lt ${loop_guard} ]; do
@@ -214,7 +214,7 @@ data:
214214
exit 1
215215
fi
216216
fi # if assert-leader
217-
# Ensure Config-sync connection state is Connected before proceeding
217+
# Ensure Config-sync connection state is Connected for both primary and backup before proceeding
218218
count=0
219219
echo "`date` INFO: ${APP}-Waiting for config-sync connected"
220220
while [ ${count} -lt ${loop_guard} ]; do
@@ -239,11 +239,12 @@ data:
239239
fi
240240
# Now can issue assert-leader command
241241
if [ "${resync_step_required}" = "true" ]; then
242-
echo "`date` INFO: ${APP}-Initiating assert-leader"
243-
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
244-
-q "<rpc><admin><config-sync><assert-leader><router/></assert-leader></config-sync></admin></rpc>"
245-
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
246-
-q "<rpc><admin><config-sync><assert-leader><vpn-name>*</vpn-name></assert-leader></config-sync></admin></rpc>"
242+
# This branch is AD-active only
243+
echo "`date` INFO: ${APP}-Initiating assert-leader"
244+
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
245+
-q "<rpc><admin><config-sync><assert-leader><router/></assert-leader></config-sync></admin></rpc>"
246+
/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
247+
-q "<rpc><admin><config-sync><assert-leader><vpn-name>*</vpn-name></assert-leader></config-sync></admin></rpc>"
247248
fi
248249
# Wait for config-sync results
249250
count=0
@@ -263,7 +264,7 @@ data:
263264
((count++))
264265
echo "`date` INFO: ${APP}-Waited ${run_time} seconds, Config-sync is: ${confsyncstatus_results}, not yet Up"
265266

266-
# Additional check to confirm config-sync
267+
# Additional checks to confirm config-sync (even if reported globally as not Up, it may still be up between local primary and backup in a DR setup)
267268
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
268269
messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
269270
-q "<rpc><show><config-sync><database/><detail/></config-sync></show></rpc>" \
@@ -378,36 +379,15 @@ data:
378379
IFS='-' read -ra host_array <<< $(hostname)
379380
node_ordinal=${host_array[-1]}
380381
password=`cat /mnt/disks/secrets/username_admin_password`
381-
382-
# For update (includes SolOS upgrade) purposes, additional checks are required for readiness state when the pod has been started
383-
# This is an update if the LASTVERSION_FILE with K8s controller-revision-hash exists and contents differ from current value
384-
LASTVERSION_FILE=/var/lib/solace/var/lastConfigRevisionBeforeReboot
385-
if [ ! -f ${LASTVERSION_FILE} ] || [[ $(cat ${LASTVERSION_FILE}) != $(get_label "controller-revision-hash") ]] ; then
386-
echo "`date` INFO: ${APP}-Initial startup or Upgrade detected, running additional checks..."
387-
# Check redundancy
388-
echo "`date` INFO: ${APP}-Running checks. Redundancy state check started..."
389-
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
390-
-q "<rpc><show><redundancy/></show></rpc>" \
391-
-v "/rpc-reply/rpc/show/redundancy/redundancy-status"`
392-
redundancystatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
393-
if [ "${redundancystatus_results}" != "Up" ]; then
394-
echo "`date` INFO: ${APP}-Redundancy state is not yet up."
395-
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
396-
fi
397-
398-
fi
399-
# Record current version in LASTVERSION_FILE
400-
echo $(get_label "controller-revision-hash") > ${LASTVERSION_FILE}
401382
# For monitor node just check for redundancy; active label will never be set
402383
if [ "${node_ordinal}" = "2" ]; then
403384
# Check redundancy
404-
echo "`date` INFO: ${APP}-Running checks. Redundancy state check started..."
405385
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
406386
-q "<rpc><show><redundancy/></show></rpc>" \
407387
-v "/rpc-reply/rpc/show/redundancy/redundancy-status"`
408388
redundancystatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
409389
if [ "${redundancystatus_results}" != "Up" ]; then
410-
echo "`date` INFO: ${APP}-Redundancy state is not yet up."
390+
echo "`date` INFO: ${APP}-Waiting for redundancy up, redundancy state is not yet up."
411391
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
412392
fi
413393
if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
@@ -418,6 +398,7 @@ data:
418398
fi
419399
exit 0
420400
fi # End Monitor Node
401+
# From here on, only message routing nodes are handled.
421402
# For Primary or Backup nodes set both service readiness (active label) and k8s readiness (exit return value)
422403
health_result=`curl -s -o /dev/null -w "%{http_code}" http://localhost:5550/health-check/guaranteed-active`
423404
case "${health_result}" in
@@ -467,54 +448,52 @@ data:
467448
echo "`date` INFO: ${APP}-Running checks.Redundancy state is not yet up."
468449
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
469450
fi
470-
# Additionally check config-sync status for non-monitoring nodes
471-
if [ "${node_ordinal}" != "2" ]; then
472-
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
473-
-q "<rpc><show><config-sync></config-sync></show></rpc>" \
474-
-v "/rpc-reply/rpc/show/config-sync/status/oper-status"`
475-
confsyncstatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
476-
if [ "${confsyncstatus_results}" != "Up" ]; then
451+
# Check config-sync status
452+
results=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
453+
-q "<rpc><show><config-sync></config-sync></show></rpc>" \
454+
-v "/rpc-reply/rpc/show/config-sync/status/oper-status"`
455+
confsyncstatus_results=`echo ${results} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
456+
if [ "${confsyncstatus_results}" != "Up" ]; then
477457

478-
# Additional check to confirm config-sync
479-
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
458+
# Additional check to confirm config-sync
459+
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Starting additional checks to confirm config-sync locally..."
480460

481-
messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
482-
-q "<rpc><show><config-sync><database/><detail/></config-sync></show></rpc>" \
483-
-v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)"`
484-
messagevpn_total=`echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
461+
messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
462+
-q "<rpc><show><config-sync><database/><detail/></config-sync></show></rpc>" \
463+
-v "count(/rpc-reply/rpc/show/config-sync/database/local/tables/table)"`
464+
messagevpn_total=`echo ${messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
485465

486-
# Count message_vpns in-sync and compare with total
487-
localmessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
488-
-q "<rpc><show><config-sync><database/></config-sync></show></rpc>" \
489-
-v "count(//table[sync-state='In-Sync'])"`
490-
local_messagevpn_total_insync=`echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
491-
if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then
492-
echo "`date` INFO: ${APP}-Config-sync state is not in-sync locally."
493-
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
494-
fi
466+
# Count message_vpns in-sync and compare with total
467+
localmessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
468+
-q "<rpc><show><config-sync><database/></config-sync></show></rpc>" \
469+
-v "count(//table[sync-state='In-Sync'])"`
470+
local_messagevpn_total_insync=`echo ${localmessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
471+
if [ "$messagevpn_total" -ne "$local_messagevpn_total_insync" ]; then
472+
echo "`date` INFO: ${APP}-Config-sync state is not in-sync locally."
473+
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
474+
fi
495475

496-
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..."
497-
vpnremotehamate_result=$(get_router_remote_config_state "name")
476+
echo "`date` INFO: ${APP}-Checking Config-sync Setup. Remote config-sync state check starting..."
477+
vpnremotehamate_result=$(get_router_remote_config_state "name")
498478

499-
remote_messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
500-
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
501-
-v "count(//table/source-router[name='$vpnremotehamate_result'])"`
502-
remote_messagevpn_total=`echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
479+
remote_messagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
480+
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
481+
-v "count(//table/source-router[name='$vpnremotehamate_result'])"`
482+
remote_messagevpn_total=`echo ${remote_messagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
503483

504-
#Count message_vpns in-sync, not stale and compare with total
505-
remotemessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
506-
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
507-
-v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])"`
508-
remote_messagevpn_total_insync=`echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
509-
if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then
510-
echo "`date` INFO: ${APP}-Config-sync state is not in-sync for remote."
511-
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
512-
fi
484+
#Count message_vpns in-sync, not stale and compare with total
485+
remotemessagevpn_result=`/mnt/disks/solace/semp_query.sh -n admin -p ${password} -u http://localhost:8080 \
486+
-q "<rpc><show><config-sync><database/><remote/></config-sync></show></rpc>" \
487+
-v "count(//table/source-router[name='$vpnremotehamate_result' and sync-state='In-Sync' and stale='No'])"`
488+
remote_messagevpn_total_insync=`echo ${remotemessagevpn_result} | xmllint -xpath "string(returnInfo/valueSearchResult)" -`
489+
if [ "$remote_messagevpn_total" -ne "$remote_messagevpn_total_insync" ]; then
490+
echo "`date` INFO: ${APP}-Config-sync state is not in-sync for remote."
491+
rm -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}; exit 1
513492
fi
514493
fi
515494
# Pass readiness check
516495
if [ ! -f ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE} ]; then
517-
echo "`date` INFO: ${APP}-Redundancy is up and node is mate Active"
496+
echo "`date` INFO: ${APP}-Redundancy is up and node is Mate Active"
518497
touch ${FINAL_ACTIVITY_LOGGED_TRACKING_FILE}
519498
echo "`date` INFO: ${APP}-Server status check complete for this broker node"
520499
exit 1

0 commit comments

Comments
 (0)