Skip to content

Commit de131e3

Browse files
committed
Fixing case when switchover can run concurrently from an API call
Simplify failover code with defer to cancel failover state
1 parent 2aa9a3e commit de131e3

File tree

1 file changed

+17
-13
lines changed

1 file changed

+17
-13
lines changed

cluster/cluster_fail.go

+17-13
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,13 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
3131
res := cluster.VMasterFailover(fail)
3232
return res
3333
}
34+
if cluster.IsInFailover() {
35+
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Cancel already in failover")
36+
return false
37+
}
38+
3439
cluster.StateMachine.SetFailoverState()
40+
defer cluster.StateMachine.RemoveFailoverState()
3541
// Phase 1: Cleanup and election
3642
var err error
3743
if fail == false {
@@ -51,7 +57,7 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
5157
cluster.LogSQL(logs, err, cluster.master.URL, "MasterFailover", LvlDbg, "CheckLongRunningWrites")
5258
if qt > 0 {
5359
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long updates running on master. Cannot switchover")
54-
cluster.StateMachine.RemoveFailoverState()
60+
5561
return false
5662
}
5763

@@ -82,14 +88,12 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
8288
}
8389
case <-time.After(time.Second * time.Duration(cluster.Conf.SwitchWaitTrx)):
8490
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long running trx on master at least %d, can not switchover ", cluster.Conf.SwitchWaitTrx)
85-
cluster.StateMachine.RemoveFailoverState()
8691
return false
8792
}
8893

8994
} else {
9095
if cluster.Conf.MultiMasterGrouprep {
9196
// group replication auto elect a new master in case of failure do nothing
92-
cluster.StateMachine.RemoveFailoverState()
9397
return true
9498
}
9599
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "------------------------")
@@ -108,15 +112,13 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
108112
}
109113
if key == -1 {
110114
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "No candidates found")
111-
cluster.StateMachine.RemoveFailoverState()
112115
return false
113116
}
114117

115118
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Slave %s has been elected as a new master", cluster.slaves[key].URL)
116119

117120
if fail && !cluster.isSlaveElectable(cluster.slaves[key], true) {
118121
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Elected slave have issue cancelling failover", cluster.slaves[key].URL)
119-
cluster.StateMachine.RemoveFailoverState()
120122
return false
121123
}
122124
// Shuffle the server list
@@ -533,16 +535,16 @@ func (cluster *Cluster) MasterFailover(fail bool) bool {
533535
cluster.FailoverCtr++
534536
cluster.FailoverTs = time.Now().Unix()
535537
}
536-
cluster.StateMachine.RemoveFailoverState()
537538

538539
// Not a prefered master this code is not default
539-
if cluster.Conf.FailoverSwitchToPrefered && fail == true && cluster.Conf.PrefMaster != "" && !cluster.master.IsPrefered() {
540+
// such code is to dangerous documentation is needed
541+
/* if cluster.Conf.FailoverSwitchToPrefered && fail == true && cluster.Conf.PrefMaster != "" && !cluster.master.IsPrefered() {
540542
prm := cluster.foundPreferedMaster(cluster.slaves)
541543
if prm != nil {
542544
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Switchover after failover not on a prefered leader after failover")
543545
cluster.MasterFailover(false)
544546
}
545-
}
547+
}*/
546548

547549
return true
548550
}
@@ -1192,8 +1194,13 @@ func (cluster *Cluster) foundPreferedMaster(l []*ServerMonitor) *ServerMonitor {
11921194

11931195
// VMasterFailover triggers a leader change and returns the new master URL when all possible leader multimaster ring or galera
11941196
func (cluster *Cluster) VMasterFailover(fail bool) bool {
1197+
if cluster.IsInFailover() {
1198+
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Cancel already in failover")
1199+
return false
1200+
}
11951201

11961202
cluster.StateMachine.SetFailoverState()
1203+
defer cluster.StateMachine.RemoveFailoverState()
11971204
// Phase 1: Cleanup and election
11981205
var err error
11991206
cluster.oldMaster = cluster.vmaster
@@ -1214,7 +1221,7 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
12141221
cluster.LogSQL(logs, err, cluster.vmaster.URL, "MasterFailover", LvlDbg, "CheckLongRunningWrites")
12151222
if qt > 0 {
12161223
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long updates running on virtual master. Cannot switchover")
1217-
cluster.StateMachine.RemoveFailoverState()
1224+
12181225
return false
12191226
}
12201227

@@ -1235,7 +1242,6 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
12351242
}
12361243
case <-time.After(time.Second * time.Duration(cluster.Conf.SwitchWaitTrx)):
12371244
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "Long running trx on master at least %d, can not switchover ", cluster.Conf.SwitchWaitTrx)
1238-
cluster.StateMachine.RemoveFailoverState()
12391245
return false
12401246
}
12411247
cluster.master = cluster.vmaster
@@ -1262,7 +1268,6 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
12621268
}
12631269
if key == -1 {
12641270
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlErr, "No candidates found")
1265-
cluster.StateMachine.RemoveFailoverState()
12661271
return false
12671272
}
12681273
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Server %s has been elected as a new master", cluster.slaves[key].URL)
@@ -1287,7 +1292,7 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
12871292
}
12881293
if !fail && cluster.Conf.MultiMasterGrouprep {
12891294
result, errswitch := cluster.slaves[key].SetGroupReplicationPrimary()
1290-
cluster.StateMachine.RemoveFailoverState()
1295+
12911296
if errswitch == nil {
12921297
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, LvlInfo, "Server %s elected as new leader %s", cluster.slaves[key].URL, result)
12931298

@@ -1413,7 +1418,6 @@ func (cluster *Cluster) VMasterFailover(fail bool) bool {
14131418
}
14141419
cluster.master = nil
14151420

1416-
cluster.StateMachine.RemoveFailoverState()
14171421
return true
14181422
}
14191423

0 commit comments

Comments
 (0)