@@ -23,6 +23,9 @@ All_PODS_READY=true
2323install_kubevirt=1
2424TRANSITION_PIPE=" /tmp/cluster_transition_pipe$$ "
2525TRANSITION_FLAG_FILE=" /tmp/cluster_transition_flag"
26+ RebootReasonFile=" /persist/reboot-reason"
27+ BootReasonFile=" /persist/boot-reason"
28+ BootReasonKubeTransition=" BootReasonKubeTransition" # Must match string in types package
2629
2730# shellcheck source=pkg/kube/pubsub.sh
2831. /usr/bin/pubsub.sh
@@ -404,6 +407,32 @@ are_all_pods_ready() {
404407 return 0
405408}
406409
# Reboot the system after recording why the reboot was requested.
#
# Usage: reboot_with_reason "reason string"
#
# Arguments:
#   $1 - human-readable reason; when missing or empty, the default
#        "kube cluster conversion reboot" is recorded instead.
# Globals read:
#   BootReasonKubeTransition - boot-reason string written to the boot-reason file
#   BootReasonFile           - boot-reason file; only written if it does not already exist
#   RebootReasonFile         - reboot-reason log; a "[timestamp]: <boot reason>, <reason>"
#                              line is appended on every call
# Side effects:
#   logs via logmsg, writes the two files above, runs sync and then reboot.
reboot_with_reason() {
    # ${1:-...} covers both "no argument" and "empty argument"
    local reason="${1:-kube cluster conversion reboot}"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    logmsg "Rebooting system: $reason"

    # Keep an already recorded boot reason; only create the file when absent
    if [ ! -f "$BootReasonFile" ]; then
        printf '%s\n' "$BootReasonKubeTransition" > "$BootReasonFile"
    fi
    printf '[%s]: %s, %s\n' "$timestamp" "$BootReasonKubeTransition" "$reason" >> "$RebootReasonFile"

    # Flush the reason files to disk before the reboot is issued
    sync
    sleep 1 # short grace period after sync, kept from the original sequence
    reboot
}
435+
407436# run virtctl vnc
408437check_and_run_vnc () {
409438 pid=$( pgrep -f " /usr/bin/virtctl vnc" )
@@ -452,18 +481,27 @@ check_and_run_vnc() {
452481
453482# get the EdgeNodeClusterStatus
454483enc_status_file=" /run/zedkube/EdgeNodeClusterStatus/global.json"
484+ # If the node is part of a cluster, even in the case of only one node in the cluster,
485+ # the cluster_intf, is_bootstrap, join_serverIP, cluster_token, cluster_node_ip and
486+ # cluster_uuid are all obtained from the enc_status_file published by zedkube.
487+ # When the kubernetes node is in 'single node' mode, these variables are empty.
455488cluster_intf=" "
456489is_bootstrap=" "
457490join_serverIP=" "
458491cluster_token=" "
459492cluster_node_ip=" "
493+ cluster_uuid=" "
460494convert_to_single_node=false
461495
462496# get the EdgeNodeClusterStatus from zedkube publication
497+ # Return values:
498+ # 0 - Success: file exists and all validations passed
499+ # 1 - File exists but validation failed (incomplete/invalid data)
500+ # 2 - File does not exist
463501get_enc_status () {
464502 # Read the JSON data from the file; return 2 if the file is missing, 0 if validation passes, 1 if not
465503 if [ ! -f " $enc_status_file " ]; then
466- return 1
504+ return 2
467505 fi
468506
469507 enc_data=$( cat " $enc_status_file " )
@@ -473,8 +511,10 @@ get_enc_status() {
473511 cluster_token=$( echo " $enc_data " | jq -r ' .EncryptedClusterToken' )
474512 cluster_node_ip=$( echo " $enc_data " | jq -r ' .ClusterIPPrefix.IP' )
475513 cluster_node_ip_is_ready=$( echo " $enc_data " | jq -r ' .ClusterIPIsReady' )
514+ cluster_uuid=$( echo " $enc_data " | jq -r ' .ClusterID.UUID' )
476515 if [ -n " $cluster_intf " ] && [ -n " $join_serverIP " ] && [ -n " $cluster_token " ] && \
477516 [ -n " $cluster_node_ip " ] && [ " $cluster_node_ip_is_ready " = " true" ] && \
517+ [ -n " $cluster_uuid " ] && [ " $cluster_uuid " != " null" ] && \
478518 { [ " $is_bootstrap " = " true" ] || [ " $is_bootstrap " = " false" ]; }; then
479519 return 0
480520 else
@@ -544,15 +584,55 @@ change_to_new_token() {
544584
545585# monitor function to check if the cluster mode has changed, either from single node to cluster
546586# or from cluster to single node
587+ #
588+ # Return values:
589+ # 0 - No action needed or transition initiated successfully
590+ #
591+ # Operational Cases:
592+ #
593+ # 1. NOT INITIALIZED: Skip checks until /var/lib/all_components_initialized exists
594+ #
595+ # 2. CLUSTER-TO-SINGLE TRANSITION (enc_status=2, enc_status_file missing):
596+ # - If not in cluster mode: no action
597+ # - Otherwise: cleanup registration, mark for single-node conversion, REBOOT
598+ #
599+ # 3. SINGLE-TO-CLUSTER TRANSITION (enc_status=0, no edge-node-cluster-mode flag):
600+ # - EdgeNodeClusterStatus valid AND node was in single mode
601+ # - Wait loop until valid enc_status received
602+ # - Mark node as cluster mode before config changes
603+ # - If zks registration exists: uninstall cluster components (kubevirt, longhorn) first
604+ # - Bootstrap node case: rotate k3s token to controller-provided token
605+ # - Remove old multus config, reassign with cluster node IP
606+ # - Remove node labels for reapplication
607+ # - Create transition pipe/flag for k3s restart coordination
608+ # - Terminate k3s process
609+ # - Non-bootstrap node join case: remove TLS certs, mark debuguser for reinit
610+ # - Provision cluster config (bootstrap or join mode)
611+ # - If enc_status_file disappears during wait for joining cluster: revert back to single-node, REBOOT
612+ # - Non-bootstrap: create transition tracking file with timestamp, if joining cluster fails repeatedly, may REBOOT
613+ # - Bootstrap: wait for k3s to start
614+ # - Signal k3s restart via pipe, cleanup transition flag
615+ #
616+ # 4. ALREADY IN DESIRED MODE: No action taken
617+ #
618+ # 5. POST-CONVERSION REGISTRATION: If base-k3s-mode flag exists, uninstall kubevirt, longhorn, apply registration
619+ #
620+ # REBOOT SCENARIOS:
621+ # - Cluster-to-single: Always reboots after cleanup
622+ # - Single-to-cluster: Only non-bootstrap nodes may reboot if join fails (see check_cluster_transition_done) repeatedly
623+ # - Interrupted transition for non-bootstrap nodes: Reboots to single-node if enc_status_file disappears
547624check_cluster_config_change () {
548625
549626 # only check the cluster change when it's fully initialized
550627 if [ ! -f /var/lib/all_components_initialized ]; then
551628 return 0
552629 fi
553630
554- if [ ! -f " $enc_status_file " ]; then
555- # logmsg "EdgeNodeClusterStatus file not found"
631+ get_enc_status
632+ enc_status=$?
633+
634+ if [ $enc_status -eq 2 ]; then
635+ # the EdgeNodeClusterStatus file does not exist
556636 if [ ! -f /var/lib/edge-node-cluster-mode ]; then
557637 return 0
558638 else
@@ -566,15 +646,16 @@ check_cluster_config_change() {
566646 rm /var/lib/base-k3s-mode
567647 touch /var/lib/convert-to-single-node
568648 # We're transitioning from cluster mode to single node, so reboot is still needed
569- reboot
649+ reboot_with_reason " Transition from cluster mode to single node "
570650 fi
571- else
651+ elif [ -n " $cluster_token " ] && [ " $cluster_node_ip_is_ready " = " true " ] ; then
572652 # record we have seen this ENC status file
573653 if [ ! -f /var/lib/edge-node-cluster-mode ]; then
574654 logmsg " EdgeNodeClusterStatus file found, but the node does not have edge-node-cluster-mode"
575655 logmsg " *** check_cluster_config_change, before while loop. cluster_node_ip: $cluster_node_ip " # XXX
576656 while true ; do
577657 if get_enc_status; then
658+ # got the enc_status successfully, start single node to cluster transition
578659 logmsg " got the EdgeNodeClusterStatus successfully"
579660 # mark it cluster mode before changing the config file
580661 touch /var/lib/edge-node-cluster-mode
@@ -615,6 +696,17 @@ check_cluster_config_change() {
615696
616697 logmsg " provision config file for node to cluster mode"
617698 provision_cluster_config_file true
699+ provision_status=$?
700+
701+ # If in the middle of waiting for bootstrap node to be ready, the node is converted again to single node
702+ # we need to get out of this loop and go back to single node mode
703+ if [ $provision_status -eq 1 ]; then
704+ logmsg " EdgeNodeClusterStatus file disappeared, reset the status and back to single node and reboot"
705+ rm /var/lib/base-k3s-mode
706+ touch /var/lib/convert-to-single-node
707+ reboot_with_reason " EdgeNodeClusterStatus file disappeared during cluster join, revert to single node"
708+ fi
709+
618710 if [ " $is_bootstrap " = " false" ]; then
619711 # we got here because we know the bootstrap node is already running
620712 # For a non-bootstrap node, create transition file and record timestamp
@@ -634,10 +726,16 @@ check_cluster_config_change() {
634726 logmsg " WARNING: changing the node to cluster mode, k3s can restart"
635727 break
636728 else
729+ # If get_enc_status fails and the EdgeNodeClusterStatus file has been removed,
730+ # we need to exit the loop and try again
731+ if [ ! -f " $enc_status_file " ]; then
732+ logmsg " EdgeNodeClusterStatus file disappeared, exit the loop and try again"
733+ return 0
734+ fi
637735 sleep 10
638736 fi
639- done
640- else
737+ done # end of while true
738+ else # enc_status exists but not in all valid states
641739 return 0
642740 fi
643741 fi
@@ -673,7 +771,7 @@ check_cluster_transition_done() {
673771 fi
674772 fi
675773
676- # Check if we've been waiting too long (10 minutes)
774+ # Check if we've been waiting too long (5 minutes)
677775 # File format is "timestamp reboot_count"
678776 # Maximum reboot attempts is 3
679777 file_content=$( cat /var/lib/transition-to-cluster)
@@ -683,7 +781,7 @@ check_cluster_transition_done() {
683781 current_timestamp=$( date +%s)
684782 elapsed_time=$(( current_timestamp - transition_timestamp))
685783
686- if [ " $elapsed_time " -ge 600 ]; then # 10 minutes in seconds
784+ if [ " $elapsed_time " -ge 300 ]; then # 5 minutes in seconds
687785 logmsg " Cluster transition timeout: Been waiting for ${elapsed_time} seconds"
688786
689787 # Increment reboot counter
@@ -693,7 +791,7 @@ check_cluster_transition_done() {
693791 # Update timestamp and reboot count in the same file
694792 echo " $( date +%s) $reboot_count " > /var/lib/transition-to-cluster
695793 logmsg " Rebooting system to retry cluster transition (attempt $reboot_count of 3)..."
696- reboot
794+ reboot_with_reason " Reboot after retry cluster transition attempt $reboot_count "
697795 else
698796 logmsg " Maximum reboot attempts (3) reached. We will not reboot again."
699797 # We could consider adding some recovery action here
@@ -763,6 +861,9 @@ uninstall_components() {
763861}
764862
765863# provision the config.yaml and bootstrap-config.yaml for cluster node, passing $1 as k3s needs initializing
864+ # Return values:
865+ # 0 - Success: configuration completed successfully
866+ # 1 - enc_status_file disappeared during bootstrap wait
766867provision_cluster_config_file () {
767868# prepare the config.yaml and bootstrap-config.yaml on node
768869bootstrapContent=$( cat << - EOF
@@ -823,17 +924,31 @@ EOF
823924 if curl --insecure --max-time 2 " https://$join_serverIP :6443" > /dev/null 2>&1 ; then
824925 # logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status"
825926 # if we are here, check the bootstrap server is single or cluster mode
927+ # cluster status is reported via the http://<join_serverIP>:8080/status API; on success the result is
928+ # cluster:<cluster-uuid>. We need to verify that the cluster-uuid matches our cluster_uuid, in case we
929+ # would otherwise join the wrong cluster because of a duplicate cluster IP address
826930 if ! status=$( curl --max-time 2 -s " http://$join_serverIP :$clusterStatusPort /status" ) ; then
827931 if [ $(( counter % 30 )) -eq 1 ]; then
828932 logmsg " Attempt $counter : Failed to connect to the server. Waiting for 10 seconds..."
829933 fi
830- elif [ " $status " = " cluster" ]; then
831- logmsg " Server is in 'cluster' status. done"
832- rm " $CLUSTER_WAIT_FILE "
833- break
934+ elif echo " $status " | grep -q " ^cluster:" ; then
935+ # Extract the reported cluster UUID from the status
936+ reported_uuid=$( echo " $status " | cut -d' :' -f2)
937+
938+ # Validate the cluster UUID matches
939+ if [ " $reported_uuid " = " $cluster_uuid " ]; then
940+ logmsg " Server is in 'cluster' status with matching UUID: $cluster_uuid . Done"
941+ rm " $CLUSTER_WAIT_FILE "
942+ break
943+ else
944+ if [ $(( counter % 30 )) -eq 1 ]; then
945+ logmsg " WARNING: Cluster UUID mismatch, may have duplicate Cluster IP address! Our UUID: $cluster_uuid , Reported UUID: $reported_uuid "
946+ logmsg " Attempt $counter : Cluster UUID does not match. Waiting for 10 seconds..."
947+ fi
948+ fi
834949 else
835950 if [ $(( counter % 30 )) -eq 1 ]; then
836- logmsg " Attempt $counter : Server is not in 'cluster' status. Waiting for 10 seconds..."
951+ logmsg " Attempt $counter : Server is not in 'cluster' status (got: $status ) . Waiting for 10 seconds..."
837952 fi
838953 fi
839954 else
@@ -849,12 +964,18 @@ EOF
849964 logmsg " Attempt $counter : curl to Endpoint https://$join_serverIP :6443 failed (ping $join_serverIP : $ping_result , success=$ping_success_count , fail=$ping_fail_count ). Waiting for 10 seconds..."
850965 fi
851966 fi
967+ if [ ! -f " $enc_status_file " ]; then
968+ logmsg " EdgeNodeClusterStatus file disappeared, exit the loop query bootstrap status"
969+ rm " $CLUSTER_WAIT_FILE "
970+ return 1
971+ fi
852972 sleep 10
853973 done
854974 else
855975 logmsg " restart case with k3s already installed, no need to wait"
856976 fi
857977 fi
978+ return 0
858979}
859980
860981DATESTR=$( date)
0 commit comments