Commit 7d7b623

2 nodes cluster scenario can end up with cycling replication on the master #464

1 parent 55ed1ce commit 7d7b623
9 files changed: +115 −13 lines


cluster/cluster_chk.go
Lines changed: 9 additions & 2 deletions

@@ -256,8 +256,15 @@ func (cluster *Cluster) isMaxscaleSupectRunning() bool {
 }

 func (cluster *Cluster) isFoundCandidateMaster() bool {
-
-	key := cluster.electFailoverCandidate(cluster.slaves, false)
+	if cluster.GetTopology() == topoActivePassive {
+		return true
+	}
+	key := -1
+	if cluster.Conf.MultiMasterGrouprep {
+		key = cluster.electSwitchoverGroupReplicationCandidate(cluster.slaves, true)
+	} else {
+		key = cluster.electFailoverCandidate(cluster.slaves, false)
+	}
 	if key == -1 {
 		cluster.sme.AddState("ERR00032", state.State{ErrType: LvlErr, ErrDesc: fmt.Sprintf(clusterError["ERR00032"]), ErrFrom: "CHECK"})
 		return false
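
As far as the diff shows, candidate election is now skipped outright for an active-passive topology (there is no pool of slaves to elect from, so a candidate is always considered found) and routed to a dedicated elector under group replication. A minimal standalone sketch of that dispatch, with hypothetical electors standing in for the real electFailoverCandidate and electSwitchoverGroupReplicationCandidate:

    package main

    import "fmt"

    // Hypothetical stand-ins for the real cluster state and electors.
    type cluster struct {
    	topology      string
    	multiMasterGR bool
    }

    func (c *cluster) electGroupReplicationCandidate() int { return -1 } // assumed: -1 means none found
    func (c *cluster) electFailoverCandidate() int         { return -1 } // assumed: -1 means none found

    // isFoundCandidateMaster mirrors the dispatch added by this commit.
    func (c *cluster) isFoundCandidateMaster() bool {
    	if c.topology == "active-passive" {
    		return true // single active node: nothing to elect
    	}
    	key := -1
    	if c.multiMasterGR {
    		key = c.electGroupReplicationCandidate()
    	} else {
    		key = c.electFailoverCandidate()
    	}
    	return key != -1
    }

    func main() {
    	c := &cluster{topology: "active-passive"}
    	fmt.Println(c.isFoundCandidateMaster()) // true, electors never consulted
    }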

cluster/cluster_get.go
Lines changed: 2 additions & 0 deletions

@@ -500,6 +500,8 @@ func (cluster *Cluster) GetTopology() string {
 	} else if cluster.Conf.MasterSlavePgLogical {
 		cluster.Conf.Topology = topoMasterSlavePgLog
 		cluster.IsPostgres = true
+	} else if cluster.Conf.ActivePassive {
+		cluster.Conf.Topology = topoActivePassive
 	} else {
 		relay := cluster.GetRelayServer()
 		if relay != nil && cluster.Conf.ReplicationNoRelay == false {

cluster/cluster_has.go
Lines changed: 23 additions & 0 deletions

@@ -166,6 +166,29 @@ func (cluster *Cluster) HasAllDbUp() bool {
 	return true
 }

+func (cluster *Cluster) HasNoDbUnconnected() bool {
+	if cluster.Servers == nil {
+		return false
+	}
+	for _, s := range cluster.Servers {
+		if s != nil {
+			if s.State == stateFailed || s.State == stateUnconn /*&& misc.Contains(cluster.ignoreList, s.URL) == false*/ {
+				return false
+			}
+			if s.State == stateSuspect && cluster.GetTopology() != topoUnknown {
+				//supect is used to reload config and avoid backend state change to failed that would disable servers in proxies and cause glinch in cluster traffic
+				// at the same time to enbale bootstrap replication we need to know when server are up
+				return false
+			}
+			if s.Conn == nil {
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
 func (cluster *Cluster) HasRequestDBRestart() bool {
 	if cluster.Servers == nil {
 		return false
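
The new predicate is deliberately stricter than HasAllDbUp, which it complements in TopologyDiscover below: a server that is merely suspect, or that has no open connection yet, also fails the check, so crash records are only purged once every node is provably reachable. A minimal sketch under assumed state labels (the real constants live in the cluster package):

    package main

    import "fmt"

    // Assumed state labels; the real constants live in the cluster package.
    const (
    	stateFailed  = "Failed"
    	stateUnconn  = "StandAlone"
    	stateSuspect = "Suspect"
    )

    type server struct {
    	state     string
    	connected bool
    }

    // hasNoDbUnconnected mirrors the new predicate: every known server must be
    // neither failed, unconnected, nor suspect (while the topology is known),
    // and must hold an open connection.
    func hasNoDbUnconnected(servers []*server, topologyKnown bool) bool {
    	if servers == nil {
    		return false
    	}
    	for _, s := range servers {
    		if s == nil {
    			continue
    		}
    		if s.state == stateFailed || s.state == stateUnconn {
    			return false
    		}
    		if s.state == stateSuspect && topologyKnown {
    			return false
    		}
    		if !s.connected {
    			return false
    		}
    	}
    	return true
    }

    func main() {
    	up := []*server{{state: "Master", connected: true}, {state: "Slave", connected: true}}
    	fmt.Println(hasNoDbUnconnected(up, true)) // true
    	fmt.Println(hasNoDbUnconnected([]*server{{state: stateSuspect, connected: true}}, true)) // false
    }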

cluster/cluster_topo.go
Lines changed: 40 additions & 10 deletions

@@ -32,6 +32,7 @@ const (
 	topoMultiMasterWsrep    string = "multi-master-wsrep"
 	topoMasterSlavePgLog    string = "master-slave-pg-logical"
 	topoMasterSlavePgStream string = "master-slave-pg-stream"
+	topoActivePassive       string = "active-passive"
 )

 func (cluster *Cluster) newServerList() error {
@@ -169,10 +170,23 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 				cluster.LogPrintf(LvlDbg, "Server %s is configured as a slave", sv.URL)
 			}
 			cluster.slaves = append(cluster.slaves, sv)
+<<<<<<< HEAD
 		} else {
 			// not slave

 			if sv.BinlogDumpThreads == 0 && sv.State != stateMaster {
+=======
+		} else { // not slave
+			if sv.IsGroupReplicationMaster {
+				cluster.master = cluster.Servers[k]
+				cluster.vmaster = cluster.Servers[k]
+				cluster.master.SetMaster()
+				if cluster.master.IsReadOnly() {
+					cluster.master.SetReadWrite()
+					cluster.LogPrintf(LvlInfo, "Group replication server %s disable read only ", cluster.master.URL)
+				}
+			} else if sv.BinlogDumpThreads == 0 && sv.State != stateMaster {
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 				//sv.State = stateUnconn
 				//transition to standalone may happen despite server have never connect successfully when default to suspect
 				if cluster.Conf.LogLevel > 2 {
@@ -187,6 +201,15 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 				cluster.SetState("ERR00063", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00063"]), ErrFrom: "TOPO"})
 				// cluster.Servers[k].RejoinMaster() /* remove for rolling restart , wrongly rejoin server as master before just after swithover while the server is just stopping */
 			} else {
+<<<<<<< HEAD
+=======
+				if cluster.Conf.LogLevel > 2 {
+					cluster.LogPrintf(LvlDbg, "Server %s was set master as last non slave", sv.URL)
+				}
+				if len(cluster.Servers) == 1 {
+					cluster.Conf.ActivePassive = true
+				}
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 				cluster.master = cluster.Servers[k]
 				cluster.master.SetMaster()
 				if cluster.master.IsReadOnly() && !cluster.master.IsRelay {
@@ -196,12 +219,15 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 			}
 		}

-	}
-	// end not slave
-	}
+		} // end not slave
+	} //end loop all servers

 	// If no cluster.slaves are detected, generate an error
+<<<<<<< HEAD
 	if len(cluster.slaves) == 0 && cluster.GetTopology() != topoMultiMasterWsrep {
+=======
+	if len(cluster.slaves) == 0 && cluster.GetTopology() != topoMultiMasterWsrep && cluster.GetTopology() != topoMultiMasterGrouprep && cluster.GetTopology() != topoActivePassive {
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 		cluster.SetState("ERR00010", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00010"]), ErrFrom: "TOPO"})
 	}

@@ -309,6 +335,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 			}
 		}
 	}
+
 	// Final check if master has been found
 	if cluster.master == nil {
 		// could not detect master
@@ -361,7 +388,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error {
 	}

 	if cluster.HasAllDbUp() {
-		if len(cluster.Crashes) > 0 {
+		if len(cluster.Crashes) > 0 && cluster.HasNoDbUnconnected() {
 			cluster.LogPrintf(LvlDbg, "Purging crashes, all databses nodes up")
 			cluster.Crashes = nil
 			cluster.Save()
@@ -409,12 +436,15 @@ func (cluster *Cluster) TopologyClusterDown() bool {
 	}
 	if allslavefailed {
 		if cluster.IsDiscovered() {
-			if cluster.master != nil && cluster.Conf.Interactive == false && cluster.Conf.FailRestartUnsafe == false {
-				// forget the master if safe mode
-				// cluster.LogPrintf(LvlInfo, "Backing up last seen master: %s for safe failover restart", cluster.master.URL)
-				// cluster.lastmaster = cluster.master
-				// cluster.master = nil
-
+			if cluster.master != nil {
+				cluster.lastmaster = cluster.master
+				cluster.LogPrintf(LvlInfo, "Backing up last seen master: %s for safe failover restart", cluster.master.URL)
+
+				if cluster.Conf.FailRestartUnsafe == false {
+					// forget the master if safe mode
+					cluster.LogPrintf(LvlInfo, "Forget the leader as no more slave and failover unsafe is disable: %s ", cluster.master.URL)
+					cluster.master = nil
+				}
 			}
 		}
 		cluster.SetState("ERR00021", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00021"]), ErrFrom: "TOPO"})

cluster/srv.go
Lines changed: 2 additions & 1 deletion

@@ -254,9 +254,10 @@ func (cluster *Cluster) newServerMonitor(url string, user string, pass string, c
 	server.IsRelay = false
 	server.IsMaxscale = true
 	server.IsDelayed = server.IsInDelayedHost()
-	server.SetState(stateSuspect)
 	// NOTE: does this make sense to set the state to the same?
 	server.SetPrevState(stateSuspect)
+	server.SetState(stateSuspect)
+
 	server.Datadir = server.ClusterGroup.Conf.WorkingDir + "/" + server.ClusterGroup.Name + "/" + server.Host + "_" + server.Port
 	if _, err := os.Stat(server.Datadir); os.IsNotExist(err) {
 		os.MkdirAll(server.Datadir, os.ModePerm)

cluster/srv_rejoin.go
Lines changed: 30 additions & 0 deletions

@@ -90,6 +90,7 @@ func (server *ServerMonitor) RejoinMaster() error {
 			server.ClusterGroup.LogPrintf("ERROR", "State transfer rejoin failed")
 		}
 	}
+<<<<<<< HEAD
 	if server.ClusterGroup.Conf.AutorejoinBackupBinlog == true {
 		server.saveBinlog(crash)
 	}
@@ -118,6 +119,35 @@ func (server *ServerMonitor) RejoinMaster() error {
 		}
 		// if consul or internal proxy need to adapt read only route to new slaves
 		server.ClusterGroup.backendStateChangeProxies()
+=======
+
+		// if consul or internal proxy need to adapt read only route to new slaves
+		server.ClusterGroup.backendStateChangeProxies()
+	}
+	} else {
+		//no master discovered rediscovering from last seen
+		if server.ClusterGroup.lastmaster != nil {
+			if server.ClusterGroup.lastmaster.ServerID == server.ServerID {
+				server.ClusterGroup.LogPrintf("INFO", "Rediscovering same master from last seen master: %s", server.URL)
+				server.ClusterGroup.master = server
+				server.SetMaster()
+				server.SetReadWrite()
+				server.ClusterGroup.lastmaster = nil
+			} else {
+				if server.ClusterGroup.Conf.FailRestartUnsafe == false {
+					server.ClusterGroup.LogPrintf("INFO", "Rediscovering not the master from last seen master: %s", server.URL)
+					server.rejoinMasterAsSlave()
+					// if consul or internal proxy need to adapt read only route to new slaves
+					server.ClusterGroup.backendStateChangeProxies()
+				} else {
+					server.ClusterGroup.LogPrintf("INFO", "Rediscovering unsafe possibly electing old leader after cascading failure to flavor availability: %s", server.URL)
+					server.ClusterGroup.master = server
+				}
+			}
+		}
+
+	} // we have last seen master
+
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 	}
 	return nil
 }
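
This file too commits its conflict markers; the bab5a650 side adds the path that resolves the cycling scenario: when RejoinMaster finds no live master, it falls back to the lastmaster saved by TopologyClusterDown. The returning node is restored as leader if its ServerID matches, demoted and rejoined as a slave in safe mode, or promoted anyway when unsafe failover restarts are allowed, trading safety for availability. A condensed sketch of that three-way decision, with simplified stand-in types:

    package main

    import "fmt"

    type node struct {
    	id  uint64
    	url string
    }

    // Simplified stand-ins for the cluster fields consulted by RejoinMaster.
    type cluster struct {
    	master            *node
    	lastmaster        *node
    	failRestartUnsafe bool
    }

    // rejoin condenses the new "no master discovered" branch: compare the
    // returning node against the last seen master and pick one of three
    // outcomes. The slave rejoin is reported here rather than performed;
    // the real code calls rejoinMasterAsSlave.
    func (c *cluster) rejoin(s *node) string {
    	if c.lastmaster == nil {
    		return "no last seen master: nothing to rediscover"
    	}
    	switch {
    	case c.lastmaster.id == s.id:
    		c.master, c.lastmaster = s, nil
    		return "same ServerID as last seen master: restored as leader"
    	case !c.failRestartUnsafe:
    		return "different node, safe mode: rejoined as slave"
    	default:
    		c.master = s
    		return "different node, unsafe mode: promoted to favor availability"
    	}
    }

    func main() {
    	c := &cluster{lastmaster: &node{id: 1, url: "db1:3306"}}
    	fmt.Println(c.rejoin(&node{id: 1, url: "db1:3306"}))
    	fmt.Println(c.rejoin(&node{id: 2, url: "db2:3306"})) // lastmaster was cleared above
    }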

cluster/srv_set.go
Lines changed: 7 additions & 0 deletions

@@ -57,7 +57,14 @@ func (server *ServerMonitor) SetState(state string) {
 }

 func (server *ServerMonitor) SetPrevState(state string) {
+<<<<<<< HEAD
 	server.ClusterGroup.LogPrintf(LvlInfo, "Server %s previous state changed to: %s", server.URL, state)
+=======
+	if state == "" {
+		return
+	}
+	server.ClusterGroup.LogPrintf(LvlInfo, "Server %s previous state set to: %s", server.URL, state)
+>>>>>>> bab5a650... 2 nodes cluster scenario can end up with cycling replication on the master #464
 	server.PrevState = state
 }
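
Read together with the reorder in cluster/srv.go above (SetPrevState now runs before SetState in newServerMonitor), the new guard means an empty string can no longer overwrite a recorded previous state, and a fresh monitor starts with a consistent Suspect/Suspect pair. A toy sketch of the guarded setters, with cluster logging reduced to fmt:

    package main

    import "fmt"

    type server struct {
    	url       string
    	state     string
    	prevState string
    }

    // setPrevState mirrors the guarded version from this commit:
    // an empty state is ignored instead of clobbering prevState.
    func (s *server) setPrevState(state string) {
    	if state == "" {
    		return
    	}
    	fmt.Printf("Server %s previous state set to: %s\n", s.url, state)
    	s.prevState = state
    }

    func (s *server) setState(state string) {
    	fmt.Printf("Server %s state set to: %s\n", s.url, state)
    	s.state = state
    }

    func main() {
    	s := &server{url: "db1:3306"}
    	// commit order from newServerMonitor: previous state first, then state
    	s.setPrevState("Suspect")
    	s.setState("Suspect")
    	s.setPrevState("") // ignored by the guard
    	fmt.Println(s.prevState, s.state) // Suspect Suspect
    }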

config/config.go
Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@ type Config struct {
 	ReplicationErrorScript string `mapstructure:"replication-error-script" toml:"replication-error-script" json:"replicationErrorScript"`
 	MasterConn string `mapstructure:"replication-source-name" toml:"replication-source-name" json:"replicationSourceName"`
 	ReplicationSSL bool `mapstructure:"replication-use-ssl" toml:"replication-use-ssl" json:"replicationUseSsl"`
+	ActivePassive bool `mapstructure:"replication-active-passive" toml:"replication-active-passive" json:"replicationActivePassive"`
 	MultiMasterRing bool `mapstructure:"replication-multi-master-ring" toml:"replication-multi-master-ring" json:"replicationMultiMasterRing"`
 	MultiMasterWsrep bool `mapstructure:"replication-multi-master-wsrep" toml:"replication-multi-master-wsrep" json:"replicationMultiMasterWsrep"`
 	MultiMasterWsrepSSTMethod string `mapstructure:"replication-multi-master-wsrep-sst-method" toml:"replication-multi-master-wsrep-sst-method" json:"replicationMultiMasterWsrepSSTMethod"`

server/server_monitor.go
Lines changed: 1 addition & 0 deletions

@@ -127,6 +127,7 @@ func init() {
 	monitorCmd.Flags().IntVar(&conf.MasterConnectRetry, "replication-master-connect-retry", 10, "Replication is define using this connection retry timeout")
 	monitorCmd.Flags().StringVar(&conf.RplUser, "replication-credential", "root:mariadb", "Replication user in the [user]:[password] format")
 	monitorCmd.Flags().BoolVar(&conf.ReplicationSSL, "replication-use-ssl", false, "Replication use SSL encryption to replicate from master")
+	monitorCmd.Flags().BoolVar(&conf.ActivePassive, "replication-active-passive", false, "Active Passive topology")
 	monitorCmd.Flags().BoolVar(&conf.MultiMaster, "replication-multi-master", false, "Multi-master topology")
 	monitorCmd.Flags().BoolVar(&conf.MultiMasterWsrep, "replication-multi-master-wsrep", false, "Enable Galera multi-master")
 	monitorCmd.Flags().StringVar(&conf.MultiMasterWsrepSSTMethod, "replication-multi-master-wsrep-sst-method", "mariabackup", "mariabackup|xtrabackup-v2|rsync|mysqldump")
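
The new toggle is threaded through three layers: the --replication-active-passive flag on the monitor command, the Config.ActivePassive field (toml key replication-active-passive), and the topoActivePassive constant that GetTopology resolves from it. A minimal sketch of that resolution chain, using the standard-library flag package in place of the cobra/pflag registration shown above:

    package main

    import (
    	"flag"
    	"fmt"
    )

    const topoActivePassive = "active-passive"

    // Simplified mirror of the Config field added in config/config.go.
    type config struct {
    	ActivePassive bool
    }

    // getTopology condenses the new branch in GetTopology: the explicit
    // active-passive toggle wins before any relay/master-slave detection runs.
    func getTopology(c config) string {
    	if c.ActivePassive {
    		return topoActivePassive
    	}
    	return "unknown" // stand-in for the rest of the detection chain
    }

    func main() {
    	var c config
    	// same flag name as the real monitorCmd.Flags().BoolVar registration
    	flag.BoolVar(&c.ActivePassive, "replication-active-passive", false, "Active Passive topology")
    	flag.Parse()
    	fmt.Println(getTopology(c))
    }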
