@@ -10,6 +10,8 @@ import (
1010 "time"
1111
1212 "github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
13+ "github.com/couchbase/gocbcorex/cbhttpx"
14+ "github.com/couchbase/gocbcorex/cbmgmtx"
1315 "github.com/couchbaselabs/cbdinocluster/clusterdef"
1416 "github.com/couchbaselabs/cbdinocluster/utils/clustercontrol"
1517 "github.com/couchbaselabs/cbdinocluster/utils/dinocerts"
@@ -777,6 +779,68 @@ func (d *Deployer) addRemoveNodes(
777779 return nil , errors .Wrap (err , "failed to get post-rebalance cluster info" )
778780 }
779781
782+ // Wait until all nodes in the cluster acknowledge each other
783+ d .logger .Info ("waiting for all nodes to acknowledge each other" )
784+
785+ // First we fetch all the OTPs for nodes we know about in cbdinocluster
786+ thisClusterEx , err := d .getClusterInfoEx (ctx , thisCluster )
787+ if err != nil {
788+ return nil , errors .Wrap (err , "failed to get post-rebalance extended cluster info" )
789+ }
790+
791+ allNodeOtps := make ([]string , 0 , len (thisClusterEx .NodesEx ))
792+ for _ , node := range thisClusterEx .NodesEx {
793+ allNodeOtps = append (allNodeOtps , node .OTPNode )
794+ }
795+
796+ // Now we check that all nodes acknowledge all other nodes using OTPs to match
797+ for {
798+ allNodesAcked := true
799+
800+ for _ , node := range thisClusterEx .NodesEx {
801+ mgmt := & cbmgmtx.Management {
802+ Transport : http .DefaultTransport ,
803+ Endpoint : fmt .Sprintf ("http://%s:8091" , node .IPAddress ),
804+ Auth : cbhttpx.BasicAuth {
805+ Username : "Administrator" ,
806+ Password : "password" ,
807+ },
808+ }
809+
810+ nodeConfig , err := mgmt .GetClusterConfig (ctx , & cbmgmtx.GetClusterConfigOptions {})
811+ if err != nil {
812+ return nil , errors .Wrap (err , "failed to get cluster config from a node" )
813+ }
814+
815+ nodeNodeOtps := make ([]string , 0 , len (nodeConfig .Nodes ))
816+ for _ , nodeConfigNode := range nodeConfig .Nodes {
817+ nodeNodeOtps = append (nodeNodeOtps , nodeConfigNode .OTPNode )
818+ }
819+
820+ missingNodes := false
821+ if len (nodeNodeOtps ) != len (allNodeOtps ) {
822+ missingNodes = true
823+ }
824+ for _ , otp := range allNodeOtps {
825+ if ! slices .Contains (nodeNodeOtps , otp ) {
826+ missingNodes = true
827+ break
828+ }
829+ }
830+
831+ if missingNodes {
832+ allNodesAcked = false
833+ }
834+ }
835+
836+ if allNodesAcked {
837+ break
838+ }
839+
840+ d .logger .Info ("not all nodes have acknowledged each other... waiting" )
841+ time .Sleep (1 * time .Second )
842+ }
843+
780844 if len (nodesToAdd ) > 0 {
781845 // no need to update if DNS is disabled
782846 if thisCluster .DnsName != "" {
0 commit comments