Skip to content

Commit 0b9f160

Browse files
committed
Fixing multi-master_wsrep that set all nodes to readonly
1 parent 774fde2 commit 0b9f160

11 files changed

+104
-54
lines changed

cluster/cluster.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,10 @@ type Cluster struct {
112112
hostList []string `json:"-"`
113113
proxyList []string `json:"-"`
114114
clusterList map[string]*Cluster `json:"-"`
115-
slaves serverList `json:"-"`
116-
master *ServerMonitor `json:"-"`
117-
oldMaster *ServerMonitor `json:"-"`
118-
vmaster *ServerMonitor `json:"-"`
115+
slaves serverList `json:"slaves"`
116+
master *ServerMonitor `json:"master"`
117+
oldMaster *ServerMonitor `json:"oldmaster"`
118+
vmaster *ServerMonitor `json:"vmaster"`
119119
mxs *maxscale.MaxScale `json:"-"`
120120
dbUser string `json:"-"`
121121
dbPass string `json:"-"`

cluster/cluster_has.go

+7
Original file line numberDiff line numberDiff line change
@@ -289,3 +289,10 @@ func (cluster *Cluster) IsInFailover() bool {
289289
func (cluster *Cluster) IsDiscovered() bool {
290290
return cluster.sme.IsDiscovered()
291291
}
292+
293+
func (cluster *Cluster) IsMultiMaster() bool {
294+
if cluster.GetTopology() != topoMultiMasterWsrep || cluster.GetTopology() != topoMultiMaster || cluster.GetTopology() != topoMultiMasterRing {
295+
return true
296+
}
297+
return false
298+
}

cluster/cluster_wait.go

+1-25
Original file line numberDiff line numberDiff line change
@@ -185,31 +185,7 @@ func (cluster *Cluster) WaitMariaDBStop(server *ServerMonitor) error {
185185
}
186186

187187
func (cluster *Cluster) WaitDatabaseStart(server *ServerMonitor) error {
188-
exitloop := 0
189-
cluster.LogPrintf(LvlInfo, "Waiting database start on %s", server.URL)
190-
ticker := time.NewTicker(time.Millisecond * time.Duration(cluster.Conf.MonitoringTicker*1000))
191-
for int64(exitloop) < cluster.Conf.MonitorWaitRetry {
192-
select {
193-
case <-ticker.C:
194-
195-
exitloop++
196-
197-
err := server.Refresh()
198-
if err == nil {
199-
200-
exitloop = 9999999
201-
} else {
202-
cluster.LogPrintf(LvlInfo, "Waiting state running on %s failed with error %s ", server.URL, err)
203-
}
204-
}
205-
}
206-
if exitloop == 9999999 {
207-
cluster.LogPrintf(LvlInfo, "Waiting state running reach on %s", server.URL)
208-
} else {
209-
cluster.LogPrintf(LvlErr, "Wait state running on %s", server.URL)
210-
return errors.New("Failed to wait running database server")
211-
}
212-
return nil
188+
return server.WaitDatabaseStart()
213189
}
214190

215191
func (cluster *Cluster) WaitDatabaseSuspect(server *ServerMonitor) error {

cluster/prov.go

+18-13
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ func (cluster *Cluster) ProvisionServices() error {
6363
os.Remove(path)
6464
cluster.ResetCrashes()
6565
for _, server := range cluster.Servers {
66-
switch cluster.GetOrchestrator() {
66+
67+
switch cluster.GetOrchestrator() {
6768
case config.ConstOrchestratorOpenSVC:
6869
go cluster.OpenSVCProvisionDatabaseService(server)
6970
case config.ConstOrchestratorKubernetes:
@@ -79,6 +80,10 @@ func (cluster *Cluster) ProvisionServices() error {
7980

8081
}
8182
cluster.ProvisionDatabaseScript(server)
83+
if cluster.GetConf().ProvSerialized {
84+
server.WaitDatabaseStart()
85+
}
86+
8287
}
8388

8489
for _, server := range cluster.Servers {
@@ -96,7 +101,7 @@ func (cluster *Cluster) ProvisionServices() error {
96101
}
97102

98103
for _, prx := range cluster.Proxies {
99-
switch cluster.GetOrchestrator() {
104+
switch cluster.GetOrchestrator() {
100105
case config.ConstOrchestratorOpenSVC:
101106
go cluster.OpenSVCProvisionProxyService(prx)
102107
case config.ConstOrchestratorKubernetes:
@@ -136,7 +141,7 @@ func (cluster *Cluster) ProvisionServices() error {
136141

137142
func (cluster *Cluster) InitDatabaseService(server *ServerMonitor) error {
138143
cluster.sme.SetFailoverState()
139-
switch cluster.GetOrchestrator() {
144+
switch cluster.GetOrchestrator() {
140145
case config.ConstOrchestratorOpenSVC:
141146
go cluster.OpenSVCProvisionDatabaseService(server)
142147
case config.ConstOrchestratorKubernetes:
@@ -166,7 +171,7 @@ func (cluster *Cluster) InitDatabaseService(server *ServerMonitor) error {
166171
}
167172

168173
func (cluster *Cluster) InitProxyService(prx DatabaseProxy) error {
169-
switch cluster.GetOrchestrator() {
174+
switch cluster.GetOrchestrator() {
170175
case config.ConstOrchestratorOpenSVC:
171176
go cluster.OpenSVCProvisionProxyService(prx)
172177
case config.ConstOrchestratorKubernetes:
@@ -197,7 +202,7 @@ func (cluster *Cluster) Unprovision() error {
197202

198203
cluster.sme.SetFailoverState()
199204
for _, server := range cluster.Servers {
200-
switch cluster.GetOrchestrator() {
205+
switch cluster.GetOrchestrator() {
201206
case config.ConstOrchestratorOpenSVC:
202207
go cluster.OpenSVCUnprovisionDatabaseService(server)
203208
case config.ConstOrchestratorKubernetes:
@@ -236,7 +241,7 @@ func (cluster *Cluster) Unprovision() error {
236241
if !ok {
237242
continue
238243
}*/
239-
switch cluster.GetOrchestrator() {
244+
switch cluster.GetOrchestrator() {
240245
case config.ConstOrchestratorOpenSVC:
241246
go cluster.OpenSVCUnprovisionProxyService(prx)
242247
case config.ConstOrchestratorKubernetes:
@@ -270,7 +275,7 @@ func (cluster *Cluster) Unprovision() error {
270275
}
271276
}
272277
}
273-
switch cluster.GetOrchestrator() {
278+
switch cluster.GetOrchestrator() {
274279
case config.ConstOrchestratorOpenSVC:
275280
cluster.OpenSVCUnprovisionSecret()
276281
default:
@@ -280,7 +285,7 @@ func (cluster *Cluster) Unprovision() error {
280285
}
281286

282287
func (cluster *Cluster) UnprovisionProxyService(prx DatabaseProxy) error {
283-
switch cluster.GetOrchestrator() {
288+
switch cluster.GetOrchestrator() {
284289
case config.ConstOrchestratorOpenSVC:
285290
go cluster.OpenSVCUnprovisionProxyService(prx)
286291
case config.ConstOrchestratorKubernetes:
@@ -308,7 +313,7 @@ func (cluster *Cluster) UnprovisionProxyService(prx DatabaseProxy) error {
308313

309314
func (cluster *Cluster) UnprovisionDatabaseService(server *ServerMonitor) error {
310315
cluster.ResetCrashes()
311-
switch cluster.GetOrchestrator() {
316+
switch cluster.GetOrchestrator() {
312317
case config.ConstOrchestratorOpenSVC:
313318
go cluster.OpenSVCUnprovisionDatabaseService(server)
314319
case config.ConstOrchestratorKubernetes:
@@ -342,7 +347,7 @@ func (cluster *Cluster) StopDatabaseService(server *ServerMonitor) error {
342347
cluster.LogPrintf(LvlInfo, "Stopping database service %s", cluster.Name+"/svc/"+server.URL)
343348
var err error
344349

345-
switch cluster.GetOrchestrator() {
350+
switch cluster.GetOrchestrator() {
346351
case config.ConstOrchestratorOpenSVC:
347352
err = cluster.OpenSVCStopDatabaseService(server)
348353
case config.ConstOrchestratorKubernetes:
@@ -367,7 +372,7 @@ func (cluster *Cluster) StopProxyService(server DatabaseProxy) error {
367372
cluster.LogPrintf(LvlInfo, "Stopping Proxy service %s", cluster.Name+"/svc/"+server.GetName())
368373
var err error
369374

370-
switch cluster.GetOrchestrator() {
375+
switch cluster.GetOrchestrator() {
371376
case config.ConstOrchestratorOpenSVC:
372377
err = cluster.OpenSVCStopProxyService(server)
373378
case config.ConstOrchestratorKubernetes:
@@ -391,7 +396,7 @@ func (cluster *Cluster) StopProxyService(server DatabaseProxy) error {
391396
func (cluster *Cluster) StartProxyService(server DatabaseProxy) error {
392397
cluster.LogPrintf(LvlInfo, "Starting Proxy service %s", cluster.Name+"/svc/"+server.GetName())
393398
var err error
394-
switch cluster.GetOrchestrator() {
399+
switch cluster.GetOrchestrator() {
395400
case config.ConstOrchestratorOpenSVC:
396401
err = cluster.OpenSVCStartProxyService(server)
397402
case config.ConstOrchestratorKubernetes:
@@ -421,7 +426,7 @@ func (cluster *Cluster) ShutdownDatabase(server *ServerMonitor) error {
421426
func (cluster *Cluster) StartDatabaseService(server *ServerMonitor) error {
422427
cluster.LogPrintf(LvlInfo, "Starting Database service %s", cluster.Name+"/svc/"+server.Name)
423428
var err error
424-
switch cluster.GetOrchestrator() {
429+
switch cluster.GetOrchestrator() {
425430
case config.ConstOrchestratorOpenSVC:
426431
err = cluster.OpenSVCStartDatabaseService(server)
427432
case config.ConstOrchestratorKubernetes:

cluster/prx_mariadbshardproxy.go

+6
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ func (cluster *Cluster) CheckMdbShardServersSchema(proxy *MariadbShardProxy) {
143143
if cluster.master == nil {
144144
return
145145
}
146+
if proxy.ShardProxy.Conn == nil {
147+
return
148+
}
146149
schemas, _, err := cluster.master.GetSchemas()
147150
if err != nil {
148151
cluster.sme.AddState("WARN0089", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0089"], cluster.master.URL), ErrFrom: "PROXY", ServerUrl: cluster.master.URL})
@@ -300,6 +303,9 @@ func (cluster *Cluster) ShardProxyGetHeadCluster() *Cluster {
300303
}
301304

302305
func (cluster *Cluster) ShardProxyCreateVTable(proxy *MariadbShardProxy, schema string, table string, duplicates []*ServerMonitor, withreshard bool) error {
306+
if proxy.ShardProxy.Conn == nil {
307+
return errors.New("Shard Proxy not yet defined")
308+
}
303309
checksum64 := crc64.Checksum([]byte(schema+"_"+cluster.GetName()), cluster.crcTable)
304310
var err error
305311
var ddl string

cluster/srv.go

+10-9
Original file line numberDiff line numberDiff line change
@@ -452,11 +452,12 @@ func (server *ServerMonitor) Ping(wg *sync.WaitGroup) {
452452
}
453453
if errss == sql.ErrNoRows || noChannel {
454454
// If we reached this stage with a previously failed server, reintroduce
455-
// it as unconnected server.
455+
// it as unconnected server.master
456456
if server.PrevState == stateFailed || server.PrevState == stateErrorAuth /*|| server.PrevState == stateSuspect*/ {
457457
server.ClusterGroup.LogPrintf(LvlDbg, "State comparison reinitialized failed server %s as unconnected", server.URL)
458458
if server.ClusterGroup.Conf.ReadOnly && server.HaveWsrep == false && server.ClusterGroup.IsDiscovered() {
459-
if server.ClusterGroup.master != nil {
459+
//GetMaster abstract master for galera multi master and master slave
460+
if server.GetCluster().GetMaster() != nil {
460461
if server.ClusterGroup.Status == ConstMonitorActif && server.ClusterGroup.master.Id != server.Id && !server.ClusterGroup.IsInIgnoredReadonly(server) && !server.ClusterGroup.IsInFailover() {
461462
server.ClusterGroup.LogPrintf(LvlInfo, "Setting Read Only on unconnected server %s as active monitor and other master is discovered", server.URL)
462463
server.SetReadOnly()
@@ -466,9 +467,9 @@ func (server *ServerMonitor) Ping(wg *sync.WaitGroup) {
466467
}
467468
}
468469
}
469-
//if server.ClusterGroup.GetTopology() != topoMultiMasterWsrep {
470-
server.SetState(stateUnconn)
471-
//}
470+
if server.ClusterGroup.GetTopology() != topoMultiMasterWsrep {
471+
server.SetState(stateUnconn)
472+
}
472473
server.FailCount = 0
473474
server.ClusterGroup.backendStateChangeProxies()
474475
server.SendAlert()
@@ -480,10 +481,10 @@ func (server *ServerMonitor) Ping(wg *sync.WaitGroup) {
480481

481482
} else if server.State != stateMaster && server.PrevState != stateUnconn && server.State == stateUnconn {
482483
// Master will never get discovery in topology if it does not get unconnected first it default to suspect
483-
if server.ClusterGroup.GetTopology() != topoMultiMasterWsrep {
484-
server.SetState(stateUnconn)
485-
server.ClusterGroup.LogPrintf(LvlDbg, "State unconnected set by non-master rule on server %s", server.URL)
486-
}
484+
// if server.ClusterGroup.GetTopology() != topoMultiMasterWsrep {
485+
server.SetState(stateUnconn)
486+
server.ClusterGroup.LogPrintf(LvlDbg, "State unconnected set by non-master rule on server %s", server.URL)
487+
// }
487488
if server.ClusterGroup.Conf.ReadOnly && server.HaveWsrep == false && server.ClusterGroup.IsDiscovered() && !server.ClusterGroup.IsInIgnoredReadonly(server) && !server.ClusterGroup.IsInFailover() {
488489
server.ClusterGroup.LogPrintf(LvlInfo, "Setting Read Only on unconnected server: %s no master state and replication found", server.URL)
489490
server.SetReadOnly()

cluster/srv_chk.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ func (server *ServerMonitor) CheckSlaveSettings() {
162162
} else if sl.IsIgnored() == false && sl.HaveBinlogRow == false && server.ClusterGroup.Conf.AutorejoinFlashback == true {
163163
server.ClusterGroup.sme.AddState("WARN0049", state.State{ErrType: LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0049"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL})
164164
}
165-
if server.ClusterGroup.Conf.ForceSlaveReadOnly && sl.ReadOnly == "OFF" && !server.ClusterGroup.IsInIgnoredReadonly(server) {
165+
if server.ClusterGroup.Conf.ForceSlaveReadOnly && sl.ReadOnly == "OFF" && !server.ClusterGroup.IsInIgnoredReadonly(server) && !server.ClusterGroup.IsMultiMaster() {
166166
// In non-multimaster mode, enforce read-only flag if the option is set
167167
sl.SetReadOnly()
168168
server.ClusterGroup.LogPrintf("INFO", "Enforce read only on slave %s", sl.URL)

cluster/srv_has.go

+15
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,21 @@ func (server *ServerMonitor) IsRunning() bool {
363363
return !server.IsDown()
364364
}
365365

366+
func (server *ServerMonitor) IsConnected() bool {
367+
if server.State == stateFailed /*&& misc.Contains(cluster.ignoreList, s.URL) == false*/ {
368+
return false
369+
}
370+
if server.State == stateSuspect && server.GetCluster().GetTopology() != topoUnknown {
371+
//supect is used to reload config and avoid backend state change to failed that would disable servers in proxies and cause glinch in cluster traffic
372+
// at the same time to enbale bootstrap replication we need to know when server are up
373+
return false
374+
}
375+
if server.Conn == nil {
376+
return false
377+
}
378+
return true
379+
}
380+
366381
// IsFailed() returns true is the server is Failed or auth error
367382
func (server *ServerMonitor) IsFailed() bool {
368383
if server.State == stateFailed || server.State == stateErrorAuth {

cluster/srv_wait.go

+40-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@
99

1010
package cluster
1111

12-
import "github.com/signal18/replication-manager/utils/dbhelper"
12+
import (
13+
"errors"
14+
"time"
15+
16+
"github.com/signal18/replication-manager/utils/dbhelper"
17+
)
1318

1419
func (server *ServerMonitor) WaitSyncToMaster(master *ServerMonitor) {
1520
server.ClusterGroup.LogPrintf(LvlInfo, "Waiting for slave %s to sync", server.URL)
@@ -26,3 +31,37 @@ func (server *ServerMonitor) WaitSyncToMaster(master *ServerMonitor) {
2631
server.LogReplPostion()
2732
}
2833
}
34+
35+
func (server *ServerMonitor) WaitDatabaseStart() error {
36+
exitloop := 0
37+
server.GetCluster().LogPrintf(LvlInfo, "Waiting database start on %s", server.URL)
38+
ticker := time.NewTicker(time.Millisecond * time.Duration(server.GetCluster().GetConf().MonitoringTicker*1000))
39+
for int64(exitloop) < server.GetCluster().GetConf().MonitorWaitRetry {
40+
select {
41+
case <-ticker.C:
42+
43+
exitloop++
44+
var err error
45+
if server.GetCluster().GetTopology() == topoMultiMasterWsrep {
46+
if !server.IsConnected() {
47+
err = errors.New("Not yet connected")
48+
}
49+
} else {
50+
err = server.Refresh()
51+
}
52+
if err == nil {
53+
54+
exitloop = 9999999
55+
} else {
56+
server.GetCluster().LogPrintf(LvlInfo, "Waiting state running on %s failed with error %s ", server.URL, err)
57+
}
58+
}
59+
}
60+
if exitloop == 9999999 {
61+
server.GetCluster().LogPrintf(LvlInfo, "Waiting state running reach on %s", server.URL)
62+
} else {
63+
server.GetCluster().LogPrintf(LvlErr, "Wait state running on %s", server.URL)
64+
return errors.New("Failed to wait running database server")
65+
}
66+
return nil
67+
}

config/config.go

+1
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ type Config struct {
338338
ProvAdminUser string `mapstructure:"opensvc-admin-user" toml:"opensvc-admin-user" json:"opensvcAdminUser"`
339339
ProvUser string `mapstructure:"opensvc-user" toml:"opensvc-user" json:"opensvcUser"`
340340
ProvCodeApp string `mapstructure:"opensvc-codeapp" toml:"opensvc-codeapp" json:"opensvcCodeapp"`
341+
ProvSerialized bool `mapstructure:"prov-serialized" toml:"prov-serialized" json:"provSerialized"`
341342
ProvOrchestrator string `mapstructure:"prov-orchestrator" toml:"prov-orchestrator" json:"provOrchestrator"`
342343
ProvOrchestratorEnable string `mapstructure:"prov-orchestrator-enable" toml:"prov-orchestrator-enable" json:"provOrchestratorEnable"`
343344
ProvOrchestratorCluster string `mapstructure:"prov-orchestrator-cluster" toml:"prov-orchestrator-cluster" json:"provOrchestratorCluster"`

server/server_monitor.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ func init() {
403403
monitorCmd.Flags().StringVar(&conf.SysbenchBinaryPath, "sysbench-binary-path", "/usr/bin/sysbench", "Sysbench Wrapper in test mode")
404404
monitorCmd.Flags().StringVar(&conf.ProvDBBinaryBasedir, "prov-db-binary-basedir", "/usr/local/mysql/bin", "Path to mysqld binary")
405405
monitorCmd.Flags().StringVar(&conf.ProvDBClientBasedir, "prov-db-client-basedir", "/usr/bin", "Path to database client binary")
406-
406+
monitorCmd.Flags().BoolVar(&conf.ProvSerialized, "prov-serialized", false, "Disable concurrent provisionning")
407407
if WithOpenSVC == "ON" {
408408
monitorCmd.Flags().StringVar(&conf.ProvOrchestratorEnable, "prov-orchestrator-enable", "opensvc,kube,onpremise,local", "seprated list of orchestrator ")
409409
monitorCmd.Flags().StringVar(&conf.ProvOrchestrator, "prov-orchestrator", "opensvc", "onpremise|opensvc|kube|slapos|local")

0 commit comments

Comments
 (0)