Commit 7d7b623

2 nodes cluster scenario can end up with cycling replication on the master #464

1 parent 55ed1ce commit 7d7b623
9 files changed: +115 −13 lines


cluster/cluster_chk.go
Lines changed: 9 additions & 2 deletions

@@ -256,8 +256,15 @@ func (cluster *Cluster) isMaxscaleSupectRunning() bool {
 }

 func (cluster *Cluster) isFoundCandidateMaster() bool {
-
-	key := cluster.electFailoverCandidate(cluster.slaves, false)
+	if cluster.GetTopology() == topoActivePassive {
+		return true
+	}
+	key := -1
+	if cluster.Conf.MultiMasterGrouprep {
+		key = cluster.electSwitchoverGroupReplicationCandidate(cluster.slaves, true)
+	} else {
+		key = cluster.electFailoverCandidate(cluster.slaves, false)
+	}
 	if key == -1 {
 		cluster.sme.AddState("ERR00032", state.State{ErrType: LvlErr, ErrDesc: fmt.Sprintf(clusterError["ERR00032"]), ErrFrom: "CHECK"})
 		return false
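
As far as the diff shows, candidate election is now skipped outright for an active-passive topology (there is no pool of slaves to elect from, so a candidate is always considered found) and routed to a dedicated elector under group replication. A minimal standalone sketch of that dispatch, with hypothetical electors standing in for the real electFailoverCandidate and electSwitchoverGroupReplicationCandidate:

    package main

    import "fmt"

    // Hypothetical stand-ins for the real cluster state and electors.
    type cluster struct {
    	topology      string
    	multiMasterGR bool
    }

    func (c *cluster) electGroupReplicationCandidate() int { return -1 } // assumed: -1 means none found
    func (c *cluster) electFailoverCandidate() int         { return -1 } // assumed: -1 means none found

    // isFoundCandidateMaster mirrors the dispatch added by this commit.
    func (c *cluster) isFoundCandidateMaster() bool {
    	if c.topology == "active-passive" {
    		return true // single active node: nothing to elect
    	}
    	key := -1
    	if c.multiMasterGR {
    		key = c.electGroupReplicationCandidate()
    	} else {
    		key = c.electFailoverCandidate()
    	}
    	return key != -1
    }

    func main() {
    	c := &cluster{topology: "active-passive"}
    	fmt.Println(c.isFoundCandidateMaster()) // true, electors never consulted
    }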

cluster/cluster_get.go
Lines changed: 2 additions & 0 deletions

@@ -500,6 +500,8 @@ func (cluster *Cluster) GetTopology() string {
 	} else if cluster.Conf.MasterSlavePgLogical {
 		cluster.Conf.Topology = topoMasterSlavePgLog
 		cluster.IsPostgres = true
+	} else if cluster.Conf.ActivePassive {
+		cluster.Conf.Topology = topoActivePassive
 	} else {
 		relay := cluster.GetRelayServer()
 		if relay != nil && cluster.Conf.ReplicationNoRelay == false {

cluster/cluster_has.go
Lines changed: 23 additions & 0 deletions

@@ -166,6 +166,29 @@ func (cluster *Cluster) HasAllDbUp() bool {
 	return true
 }

+func (cluster *Cluster) HasNoDbUnconnected() bool {
+	if cluster.Servers == nil {
+		return false
+	}
+	for _, s := range cluster.Servers {
+		if s != nil {
+			if s.State == stateFailed || s.State == stateUnconn /*&& misc.Contains(cluster.ignoreList, s.URL) == false*/ {
+				return false
+			}
+			if s.State == stateSuspect && cluster.GetTopology() != topoUnknown {
+				//supect is used to reload config and avoid backend state change to failed that would disable servers in proxies and cause glinch in cluster traffic
+				// at the same time to enbale bootstrap replication we need to know when server are up
+				return false
+			}
+			if s.Conn == nil {
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
 func (cluster *Cluster) HasRequestDBRestart() bool {
 	if cluster.Servers == nil {
 		return false
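
The new predicate is deliberately stricter than HasAllDbUp, which it complements in TopologyDiscover below: a server that is merely suspect, or that has no open connection yet, also fails the check, so crash records are only purged once every node is provably reachable. A minimal sketch under assumed state labels (the real constants live in the cluster package):

    package main

    import "fmt"

    // Assumed state labels; the real constants live in the cluster package.
    const (
    	stateFailed  = "Failed"
    	stateUnconn  = "StandAlone"
    	stateSuspect = "Suspect"
    )

    type server struct {
    	state     string
    	connected bool
    }

    // hasNoDbUnconnected mirrors the new predicate: every known server must be
    // neither failed, unconnected, nor suspect (while the topology is known),
    // and must hold an open connection.
    func hasNoDbUnconnected(servers []*server, topologyKnown bool) bool {
    	if servers == nil {
    		return false
    	}
    	for _, s := range servers {
    		if s == nil {
    			continue
    		}
    		if s.state == stateFailed || s.state == stateUnconn {
    			return false
    		}
    		if s.state == stateSuspect && topologyKnown {
    			return false
    		}
    		if !s.connected {
    			return false
    		}
    	}
    	return true
    }

    func main() {
    	up := []*server{{state: "Master", connected: true}, {state: "Slave", connected: true}}
    	fmt.Println(hasNoDbUnconnected(up, true)) // true
    	fmt.Println(hasNoDbUnconnected([]*server{{state: stateSuspect, connected: true}}, true)) // false
    }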

cluster/cluster_topo.go
Lines changed: 40 additions & 10 deletions

@@ -32,6 +32,7 @@ const (
 	topoMultiMasterWsrep    string = "multi-master-wsrep"
 	topoMasterSlavePgLog    string = "master-slave-pg-logical"
 	topoMasterSlavePgStream string = "master-slave-pg-stream"
+	topoActivePassive       string = "active-passive"
 )

 func (cluster *Cluster) newServerList() error {
@@ -169,10 +170,23 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 				cluster.LogPrintf(LvlDbg, "Server %s is configured as a slave", sv.URL)
 			}
 			cluster.slaves = append(cluster.slaves, sv)
+<<<<<<< HEAD
 		} else {
 			// not slave

 			if sv.BinlogDumpThreads == 0 && sv.State != stateMaster {
+=======
+		} else { // not slave
+			if sv.IsGroupReplicationMaster {
+				cluster.master = cluster.Servers[k]
+				cluster.vmaster = cluster.Servers[k]
+				cluster.master.SetMaster()
+				if cluster.master.IsReadOnly() {
+					cluster.master.SetReadWrite()
+					cluster.LogPrintf(LvlInfo, "Group replication server %s disable read only ", cluster.master.URL)
+				}
+			} else if sv.BinlogDumpThreads == 0 && sv.State != stateMaster {
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 				//sv.State = stateUnconn
 				//transition to standalone may happen despite server have never connect successfully when default to suspect
 				if cluster.Conf.LogLevel > 2 {
@@ -187,6 +201,15 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 				cluster.SetState("ERR00063", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00063"]), ErrFrom: "TOPO"})
 				// cluster.Servers[k].RejoinMaster() /* remove for rolling restart , wrongly rejoin server as master before just after swithover while the server is just stopping */
 			} else {
+<<<<<<< HEAD
+=======
+				if cluster.Conf.LogLevel > 2 {
+					cluster.LogPrintf(LvlDbg, "Server %s was set master as last non slave", sv.URL)
+				}
+				if len(cluster.Servers) == 1 {
+					cluster.Conf.ActivePassive = true
+				}
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 				cluster.master = cluster.Servers[k]
 				cluster.master.SetMaster()
 				if cluster.master.IsReadOnly() && !cluster.master.IsRelay {
@@ -196,12 +219,15 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 			}
 		}

-	}
-	// end not slave
-	}
+		} // end not slave
+	} //end loop all servers

 	// If no cluster.slaves are detected, generate an error
+<<<<<<< HEAD
 	if len(cluster.slaves) == 0 && cluster.GetTopology() != topoMultiMasterWsrep {
+=======
+	if len(cluster.slaves) == 0 && cluster.GetTopology() != topoMultiMasterWsrep && cluster.GetTopology() != topoMultiMasterGrouprep && cluster.GetTopology() != topoActivePassive {
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 		cluster.SetState("ERR00010", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00010"]), ErrFrom: "TOPO"})
 	}

@@ -309,6 +335,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 			}
 		}
 	}
+
 	// Final check if master has been found
 	if cluster.master == nil {
 		// could not detect master
@@ -361,7 +388,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 	}

 	if cluster.HasAllDbUp() {
-		if len(cluster.Crashes) > 0 {
+		if len(cluster.Crashes) > 0 && cluster.HasNoDbUnconnected() {
 			cluster.LogPrintf(LvlDbg, "Purging crashes, all databses nodes up")
 			cluster.Crashes = nil
 			cluster.Save()
@@ -409,12 +436,15 @@ func (cluster *Cluster) TopologyClusterDown() bool {
 	}
 	if allslavefailed {
 		if cluster.IsDiscovered() {
-			if cluster.master != nil && cluster.Conf.Interactive == false && cluster.Conf.FailRestartUnsafe == false {
-				// forget the master if safe mode
-				// cluster.LogPrintf(LvlInfo, "Backing up last seen master: %s for safe failover restart", cluster.master.URL)
-				// cluster.lastmaster = cluster.master
-				// cluster.master = nil
-
+			if cluster.master != nil {
+				cluster.lastmaster = cluster.master
+				cluster.LogPrintf(LvlInfo, "Backing up last seen master: %s for safe failover restart", cluster.master.URL)
+
+				if cluster.Conf.FailRestartUnsafe == false {
+					// forget the master if safe mode
+					cluster.LogPrintf(LvlInfo, "Forget the leader as no more slave and failover unsafe is disable: %s ", cluster.master.URL)
+					cluster.master = nil
+				}
 			}
 		}
 		cluster.SetState("ERR00021", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00021"]), ErrFrom: "TOPO"})

cluster/srv.go
Lines changed: 2 additions & 1 deletion

@@ -254,9 +254,10 @@ func (cluster *Cluster) newServerMonitor(url string, user string, pass string, c
 	server.IsRelay = false
 	server.IsMaxscale = true
 	server.IsDelayed = server.IsInDelayedHost()
-	server.SetState(stateSuspect)
 	// NOTE: does this make sense to set the state to the same?
 	server.SetPrevState(stateSuspect)
+	server.SetState(stateSuspect)
+
 	server.Datadir = server.ClusterGroup.Conf.WorkingDir + "/" + server.ClusterGroup.Name + "/" + server.Host + "_" + server.Port
 	if _, err := os.Stat(server.Datadir); os.IsNotExist(err) {
 		os.MkdirAll(server.Datadir, os.ModePerm)

cluster/srv_rejoin.go
Lines changed: 30 additions & 0 deletions

@@ -90,6 +90,7 @@ func (server *ServerMonitor) RejoinMaster() error {
 			server.ClusterGroup.LogPrintf("ERROR", "State transfer rejoin failed")
 		}
 	}
+<<<<<<< HEAD
 	if server.ClusterGroup.Conf.AutorejoinBackupBinlog == true {
 		server.saveBinlog(crash)
 	}
@@ -118,6 +119,35 @@ func (server *ServerMonitor) RejoinMaster() error {
 		}
 		// if consul or internal proxy need to adapt read only route to new slaves
 		server.ClusterGroup.backendStateChangeProxies()
+=======
+
+		// if consul or internal proxy need to adapt read only route to new slaves
+		server.ClusterGroup.backendStateChangeProxies()
+	}
+	} else {
+		//no master discovered rediscovering from last seen
+		if server.ClusterGroup.lastmaster != nil {
+			if server.ClusterGroup.lastmaster.ServerID == server.ServerID {
+				server.ClusterGroup.LogPrintf("INFO", "Rediscovering same master from last seen master: %s", server.URL)
+				server.ClusterGroup.master = server
+				server.SetMaster()
+				server.SetReadWrite()
+				server.ClusterGroup.lastmaster = nil
+			} else {
+				if server.ClusterGroup.Conf.FailRestartUnsafe == false {
+					server.ClusterGroup.LogPrintf("INFO", "Rediscovering not the master from last seen master: %s", server.URL)
+					server.rejoinMasterAsSlave()
+					// if consul or internal proxy need to adapt read only route to new slaves
+					server.ClusterGroup.backendStateChangeProxies()
+				} else {
+					server.ClusterGroup.LogPrintf("INFO", "Rediscovering unsafe possibly electing old leader after cascading failure to flavor availability: %s", server.URL)
+					server.ClusterGroup.master = server
+				}
+			}
+		}
+
+	} // we have last seen master
+
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 	}
 	return nil
 }
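
This file too commits its conflict markers; the bab5a650 side adds the path that resolves the cycling scenario: when RejoinMaster finds no live master, it falls back to the lastmaster saved by TopologyClusterDown. The returning node is restored as leader if its ServerID matches, demoted and rejoined as a slave in safe mode, or promoted anyway when unsafe failover restarts are allowed, trading safety for availability. A condensed sketch of that three-way decision, with simplified stand-in types:

    package main

    import "fmt"

    type node struct {
    	id  uint64
    	url string
    }

    // Simplified stand-ins for the cluster fields consulted by RejoinMaster.
    type cluster struct {
    	master            *node
    	lastmaster        *node
    	failRestartUnsafe bool
    }

    // rejoin condenses the new "no master discovered" branch: compare the
    // returning node against the last seen master and pick one of three
    // outcomes. The slave rejoin is reported here rather than performed;
    // the real code calls rejoinMasterAsSlave.
    func (c *cluster) rejoin(s *node) string {
    	if c.lastmaster == nil {
    		return "no last seen master: nothing to rediscover"
    	}
    	switch {
    	case c.lastmaster.id == s.id:
    		c.master, c.lastmaster = s, nil
    		return "same ServerID as last seen master: restored as leader"
    	case !c.failRestartUnsafe:
    		return "different node, safe mode: rejoined as slave"
    	default:
    		c.master = s
    		return "different node, unsafe mode: promoted to favor availability"
    	}
    }

    func main() {
    	c := &cluster{lastmaster: &node{id: 1, url: "db1:3306"}}
    	fmt.Println(c.rejoin(&node{id: 1, url: "db1:3306"}))
    	fmt.Println(c.rejoin(&node{id: 2, url: "db2:3306"})) // lastmaster was cleared above
    }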

cluster/srv_set.go
Lines changed: 7 additions & 0 deletions

@@ -57,7 +57,14 @@ func (server *ServerMonitor) SetState(state string) {
 }

 func (server *ServerMonitor) SetPrevState(state string) {
+<<<<<<< HEAD
 	server.ClusterGroup.LogPrintf(LvlInfo, "Server %s previous state changed to: %s", server.URL, state)
+=======
+	if state == "" {
+		return
+	}
+	server.ClusterGroup.LogPrintf(LvlInfo, "Server %s previous state set to: %s", server.URL, state)
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 	server.PrevState = state
 }
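
Read together with the reorder in cluster/srv.go above (SetPrevState now runs before SetState in newServerMonitor), the new guard means an empty string can no longer overwrite a recorded previous state, and a fresh monitor starts with a consistent Suspect/Suspect pair. A toy sketch of the guarded setters, with cluster logging reduced to fmt:

    package main

    import "fmt"

    type server struct {
    	url       string
    	state     string
    	prevState string
    }

    // setPrevState mirrors the guarded version from this commit:
    // an empty state is ignored instead of clobbering prevState.
    func (s *server) setPrevState(state string) {
    	if state == "" {
    		return
    	}
    	fmt.Printf("Server %s previous state set to: %s\n", s.url, state)
    	s.prevState = state
    }

    func (s *server) setState(state string) {
    	fmt.Printf("Server %s state set to: %s\n", s.url, state)
    	s.state = state
    }

    func main() {
    	s := &server{url: "db1:3306"}
    	// commit order from newServerMonitor: previous state first, then state
    	s.setPrevState("Suspect")
    	s.setState("Suspect")
    	s.setPrevState("") // ignored by the guard
    	fmt.Println(s.prevState, s.state) // Suspect Suspect
    }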

config/config.go
Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@ type Config struct {
 	ReplicationErrorScript string `mapstructure:"replication-error-script" toml:"replication-error-script" json:"replicationErrorScript"`
 	MasterConn string `mapstructure:"replication-source-name" toml:"replication-source-name" json:"replicationSourceName"`
 	ReplicationSSL bool `mapstructure:"replication-use-ssl" toml:"replication-use-ssl" json:"replicationUseSsl"`
+	ActivePassive bool `mapstructure:"replication-active-passive" toml:"replication-active-passive" json:"replicationActivePassive"`
 	MultiMasterRing bool `mapstructure:"replication-multi-master-ring" toml:"replication-multi-master-ring" json:"replicationMultiMasterRing"`
 	MultiMasterWsrep bool `mapstructure:"replication-multi-master-wsrep" toml:"replication-multi-master-wsrep" json:"replicationMultiMasterWsrep"`
 	MultiMasterWsrepSSTMethod string `mapstructure:"replication-multi-master-wsrep-sst-method" toml:"replication-multi-master-wsrep-sst-method" json:"replicationMultiMasterWsrepSSTMethod"`

server/server_monitor.go
Lines changed: 1 addition & 0 deletions

@@ -127,6 +127,7 @@ func init() {
 	monitorCmd.Flags().IntVar(&conf.MasterConnectRetry, "replication-master-connect-retry", 10, "Replication is define using this connection retry timeout")
 	monitorCmd.Flags().StringVar(&conf.RplUser, "replication-credential", "root:mariadb", "Replication user in the [user]:[password] format")
 	monitorCmd.Flags().BoolVar(&conf.ReplicationSSL, "replication-use-ssl", false, "Replication use SSL encryption to replicate from master")
+	monitorCmd.Flags().BoolVar(&conf.ActivePassive, "replication-active-passive", false, "Active Passive topology")
 	monitorCmd.Flags().BoolVar(&conf.MultiMaster, "replication-multi-master", false, "Multi-master topology")
 	monitorCmd.Flags().BoolVar(&conf.MultiMasterWsrep, "replication-multi-master-wsrep", false, "Enable Galera multi-master")
 	monitorCmd.Flags().StringVar(&conf.MultiMasterWsrepSSTMethod, "replication-multi-master-wsrep-sst-method", "mariabackup", "mariabackup|xtrabackup-v2|rsync|mysqldump")
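
The new toggle is threaded through three layers: the --replication-active-passive flag on the monitor command, the Config.ActivePassive field (toml key replication-active-passive), and the topoActivePassive constant that GetTopology resolves from it. A minimal sketch of that resolution chain, using the standard-library flag package in place of the cobra/pflag registration shown above:

    package main

    import (
    	"flag"
    	"fmt"
    )

    const topoActivePassive = "active-passive"

    // Simplified mirror of the Config field added in config/config.go.
    type config struct {
    	ActivePassive bool
    }

    // getTopology condenses the new branch in GetTopology: the explicit
    // active-passive toggle wins before any relay/master-slave detection runs.
    func getTopology(c config) string {
    	if c.ActivePassive {
    		return topoActivePassive
    	}
    	return "unknown" // stand-in for the rest of the detection chain
    }

    func main() {
    	var c config
    	// same flag name as the real monitorCmd.Flags().BoolVar registration
    	flag.BoolVar(&c.ActivePassive, "replication-active-passive", false, "Active Passive topology")
    	flag.Parse()
    	fmt.Println(getTopology(c))
    }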
