Skip to content

Commit b26cbf7

Browse files
Merge pull request #1097 from signal18/1095-stop-maintenance-when-repman-disk-is-full
1095 stop maintenance when repman disk is full
2 parents 3966bbd + 033a79f commit b26cbf7

File tree

15 files changed

+500
-86
lines changed

15 files changed

+500
-86
lines changed

cluster/cluster.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ type Cluster struct {
242242
ConfigManager *manager.ConfigManager `json:"-"`
243243
failSendCount int `json:"-"`
244244
MeetUserID string `json:"-"` //To store meet user id
245+
DiskStatManager *misc.DiskStatManager `json:"diskStat"`
245246
LastDelayStatPrint time.Time
246247
sync.Mutex
247248
crcTable *crc64.Table
@@ -695,8 +696,9 @@ func (cluster *Cluster) Run() {
695696
go cluster.CheckCredentialRotation()
696697
cluster.CheckCanSaveDynamicConfig()
697698
cluster.CheckIsOverwrite()
699+
cluster.CheckAllBackupFreeSpace()
698700
} else {
699-
cluster.StateMachine.PreserveState("WARN0093", "WARN0084", "WARN0095", "WARN0101", "WARN0111", "WARN0112", "ERR00090", "WARN0102", "WARN0134")
701+
cluster.StateMachine.PreserveState("WARN0093", "WARN0084", "WARN0095", "WARN0101", "WARN0111", "WARN0112", "ERR00090", "WARN0102", "WARN0134", "WARN0139", "WARN0140", "WARN0141", "WARN0142", "WARN0143")
700702
}
701703
if !cluster.CanInitNodes {
702704
cluster.SetState("ERR00082", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00082"], cluster.errorInitNodes), ErrFrom: "OPENSVC"})

cluster/cluster_bck.go

+108
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,13 @@ package cluster
99
import (
1010
"fmt"
1111
"os"
12+
"sync"
1213

14+
"github.com/dustin/go-humanize"
15+
"github.com/shirou/gopsutil/disk"
1316
"github.com/signal18/replication-manager/config"
1417
"github.com/signal18/replication-manager/utils/archiver"
18+
"github.com/signal18/replication-manager/utils/dbhelper"
1519
"github.com/signal18/replication-manager/utils/state"
1620
"github.com/sirupsen/logrus"
1721
)
@@ -196,3 +200,107 @@ func (cluster *Cluster) ResticResetQueue() error {
196200

197201
return nil
198202
}
203+
204+
func (cluster *Cluster) CheckBackupFreeSpace(backtype string, backup bool) error {
205+
var isWarning bool
206+
bcksrv := cluster.GetBackupServer()
207+
if bcksrv == nil {
208+
bcksrv = cluster.master
209+
}
210+
211+
parentDir := cluster.Conf.WorkingDir + "/" + config.ConstStreamingSubDir + "/" + cluster.Name
212+
diskstat, err := disk.Usage(parentDir)
213+
if err != nil {
214+
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModTask, config.LvlErr, "Error getting disk usage: %s", err)
215+
return err
216+
}
217+
218+
cluster.DiskStatManager.UpdateStat(parentDir, diskstat)
219+
if diskstat.UsedPercent > float64(cluster.Conf.BackupDiskTresholdCrit) {
220+
cluster.SetState("WARN0140", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0140"], diskstat.Path, diskstat.UsedPercent, cluster.Conf.BackupDiskTresholdCrit), ErrFrom: "JOB", ServerUrl: bcksrv.URL})
221+
return fmt.Errorf("Disk usage is over %d%% on %s. Used: %s", cluster.Conf.BackupDiskTresholdCrit, diskstat.Path, humanize.Bytes(diskstat.Used))
222+
} else if diskstat.UsedPercent > float64(cluster.Conf.BackupDiskTresholdWarn) {
223+
isWarning = true
224+
cluster.SetState("WARN0139", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0139"], diskstat.Path, diskstat.UsedPercent, cluster.Conf.BackupDiskTresholdWarn), ErrFrom: "JOB", ServerUrl: bcksrv.URL})
225+
}
226+
227+
// Estimate size if disk usage is over treshold and estimate size is enabled. For binlog we will always estimate size to 2GB
228+
if (isWarning && cluster.Conf.BackupEstimateSize) || backtype == "binlog" {
229+
free := diskstat.Free
230+
required := uint64(0)
231+
232+
switch backtype {
233+
case "logical", "physical":
234+
_, prev := bcksrv.GetLatestMeta(backtype)
235+
if prev != nil && prev.Completed {
236+
required = uint64(prev.Size * int64(100+cluster.Conf.BackupGrowthPercentage) / 100)
237+
238+
// If not keep until valid, we need to add the size of the previous backup to the free space
239+
if !cluster.Conf.BackupKeepUntilValid {
240+
free = free + uint64(prev.Size)
241+
}
242+
243+
} else {
244+
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModTask, config.LvlInfo, "No previous backup found for %s. Estimating backup size.", bcksrv.URL)
245+
estimatedSize, err := dbhelper.GetBackupSizeEstimation(bcksrv.Conn, bcksrv.DBVersion)
246+
if err != nil {
247+
return fmt.Errorf("Error estimating backup size: %s", err)
248+
}
249+
250+
required = estimatedSize * uint64(cluster.Conf.BackupEstimateSizePercentage) / 100
251+
}
252+
case "binlog":
253+
// Max binlog size per file is 1GB, additional 1GB for unexpected growth
254+
required = 2 * 1024 * 1024 * 1024
255+
case "restic":
256+
// Restic backup size is not known until the backup is done
257+
}
258+
259+
if free < required {
260+
if backtype == "logical" {
261+
cluster.SetState("WARN0141", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0139"], cluster.Conf.BackupLogicalType, bcksrv.URL, diskstat.Path, humanize.Bytes(diskstat.Free), humanize.Bytes(required)), ErrFrom: "JOB", ServerUrl: bcksrv.URL})
262+
} else if backtype == "physical" {
263+
cluster.SetState("WARN0142", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0140"], cluster.Conf.BackupPhysicalType, bcksrv.URL, diskstat.Path, humanize.Bytes(diskstat.Free), humanize.Bytes(required)), ErrFrom: "JOB", ServerUrl: bcksrv.URL})
264+
} else if backtype == "binlog" {
265+
cluster.SetState("WARN0143", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0141"], bcksrv.URL, diskstat.Path, humanize.Bytes(diskstat.Free), humanize.Bytes(required)), ErrFrom: "JOB", ServerUrl: bcksrv.URL})
266+
}
267+
268+
return fmt.Errorf("Not enough free space on %s for backup. Free: %s", diskstat.Path, humanize.Bytes(diskstat.Free))
269+
}
270+
271+
if backup {
272+
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModTask, config.LvlInfo, "Free space is enough on %s: %s. Required: %s", diskstat.Path, humanize.Bytes(diskstat.Free), humanize.Bytes(required))
273+
}
274+
}
275+
276+
return nil
277+
}
278+
279+
func (cluster *Cluster) CheckAllBackupFreeSpace() {
280+
if !cluster.Conf.BackupCheckFreeSpace {
281+
return
282+
}
283+
284+
// Check based on treshold
285+
wg := sync.WaitGroup{}
286+
wg.Add(1)
287+
go func() {
288+
cluster.CheckBackupFreeSpace("logical", false)
289+
wg.Done()
290+
}()
291+
292+
// if estimate size is enabled, check the free space for physical and binlog backups too
293+
if cluster.Conf.BackupEstimateSize {
294+
wg.Add(2)
295+
go func() {
296+
cluster.CheckBackupFreeSpace("physical", false)
297+
wg.Done()
298+
}()
299+
go func() {
300+
cluster.CheckBackupFreeSpace("binlog", false)
301+
wg.Done()
302+
}()
303+
}
304+
305+
wg.Wait()
306+
}

cluster/cluster_set.go

+4
Original file line numberDiff line numberDiff line change
@@ -2461,3 +2461,7 @@ func (cluster *Cluster) RenameCluster(newClusterName string) error {
24612461
return nil
24622462

24632463
}
2464+
2465+
func (cluster *Cluster) SetLogStatsLevel(value int) {
2466+
cluster.Conf.LogStatsLevel = value
2467+
}

cluster/srv_job.go

+29-6
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,14 @@ func (server *ServerMonitor) JobInsertTask(task string, port string, repmanhost
245245
return res.LastInsertId()
246246
}
247247

248-
func (server *ServerMonitor) JobBackupPhysical() (int64, error) {
248+
func (server *ServerMonitor) JobBackupPhysical() error {
249249
//server can be nil as no dicovered master
250250
if server == nil {
251-
return 0, nil
251+
return nil
252252
}
253253

254254
if server.IsDown() {
255-
return 0, nil
255+
return nil
256256
}
257257

258258
cluster := server.ClusterGroup
@@ -297,7 +297,7 @@ func (server *ServerMonitor) JobBackupPhysical() (int64, error) {
297297

298298
if err != nil {
299299
cluster.SetInPhysicalBackupState(false)
300-
return 0, nil
300+
return nil
301301
}
302302

303303
now := time.Now()
@@ -308,6 +308,14 @@ func (server *ServerMonitor) JobBackupPhysical() (int64, error) {
308308
prevId = prev.Id
309309
}
310310

311+
// Check for previous backup size
312+
if cluster.Conf.BackupCheckFreeSpace {
313+
err = cluster.CheckBackupFreeSpace("physical", true)
314+
if err != nil {
315+
return err
316+
}
317+
}
318+
311319
// Remove from backup list, since the file will be replaced
312320
if !cluster.Conf.BackupKeepUntilValid {
313321
cluster.BackupMetaMap.Delete(prevId)
@@ -327,12 +335,12 @@ func (server *ServerMonitor) JobBackupPhysical() (int64, error) {
327335

328336
cluster.BackupMetaMap.Set(server.LastBackupMeta.Physical.Id, server.LastBackupMeta.Physical)
329337

330-
jobid, err := server.JobInsertTask(cluster.Conf.BackupPhysicalType, port, cluster.Conf.MonitorAddress)
338+
_, err = server.JobInsertTask(cluster.Conf.BackupPhysicalType, port, cluster.Conf.MonitorAddress)
331339
if err != nil {
332340
cluster.SetInPhysicalBackupState(false)
333341
}
334342

335-
return jobid, err
343+
return err
336344
}
337345

338346
func (server *ServerMonitor) JobReseedPhysicalBackup(backtype string) error {
@@ -2040,6 +2048,14 @@ func (server *ServerMonitor) JobBackupLogical() error {
20402048
prevId = prev.Id
20412049
}
20422050

2051+
// Check for previous backup size
2052+
if cluster.Conf.BackupCheckFreeSpace {
2053+
err = cluster.CheckBackupFreeSpace("logical", true)
2054+
if err != nil {
2055+
return err
2056+
}
2057+
}
2058+
20432059
// Remove from backup list, since the file will be replaced
20442060
if !cluster.Conf.BackupKeepUntilValid {
20452061
cluster.BackupMetaMap.Delete(prevId)
@@ -2437,6 +2453,13 @@ func (server *ServerMonitor) JobBackupBinlog(binlogfile string, isPurge bool) er
24372453
defer cluster.SetInBinlogBackupState(false)
24382454
}
24392455

2456+
if cluster.Conf.BackupCheckFreeSpace {
2457+
err = cluster.CheckBackupFreeSpace("binlog", true)
2458+
if err != nil {
2459+
return err
2460+
}
2461+
}
2462+
24402463
server.SetBackingUpBinaryLog(true)
24412464
defer server.SetBackingUpBinaryLog(false)
24422465

config/config.go

+12
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ type Config struct {
148148
LogSupportLevel int `scope:"server" mapstructure:"log-support-level" toml:"log-support-level" json:"logSupportLevel"`
149149
LogExternalScript bool `mapstructure:"log-external-script" toml:"log-external-script" json:"ExternalScript"`
150150
LogExternalScriptLevel int `mapstructure:"log-external-script-level" toml:"log-external-script-level" json:"logExternalScriptLevel"`
151+
LogStatsLevel int `scope:"server" mapstructure:"log-stats-level" toml:"log-stats-level" json:"logStatsLevel"`
151152
User string `mapstructure:"db-servers-credential" toml:"db-servers-credential" json:"dbServersCredential"`
152153
Hosts string `mapstructure:"db-servers-hosts" toml:"db-servers-hosts" json:"dbServersHosts"`
153154
DbServersChangeStateScript string `mapstructure:"db-servers-state-change-script" toml:"db-servers-state-change-script" json:"dbServersStateChangeScript"`
@@ -642,6 +643,12 @@ type Config struct {
642643
BackupSaveScript string `mapstructure:"backup-save-script" toml:"backup-save-script" json:"backupSaveScript"`
643644
BackupLoadScript string `mapstructure:"backup-load-script" toml:"backup-load-script" json:"backupLoadScript"`
644645
CompressBackups bool `mapstructure:"compress-backups" toml:"compress-backups" json:"compressBackups"`
646+
BackupCheckFreeSpace bool `mapstructure:"backup-check-free-space" toml:"backup-check-free-space" json:"backupCheckFreeSpace"`
647+
BackupDiskTresholdWarn int `mapstructure:"backup-disk-treshold-warn" toml:"backup-disk-treshold-warn" json:"backupDiskTresholdWarn"`
648+
BackupDiskTresholdCrit int `mapstructure:"backup-disk-treshold-crit" toml:"backup-disk-treshold-crit" json:"backupDiskTresholdCrit"`
649+
BackupEstimateSize bool `mapstructure:"backup-estimate-size" toml:"backup-estimate-size" json:"backupEstimateSize"`
650+
BackupEstimateSizePercentage int `mapstructure:"backup-estimate-size-percentage" toml:"backup-estimate-size-percentage" json:"backupEstimateSizePercentage"`
651+
BackupGrowthPercentage int `mapstructure:"backup-growth-percentage" toml:"backup-growth-percentage" json:"backupGrowthPercentage"`
645652
SchedulerDatabaseLogsTableRotate bool `mapstructure:"scheduler-db-servers-logs-table-rotate" toml:"scheduler-db-servers-logs-table-rotate" json:"schedulerDbServersLogsTableRotate"`
646653
SchedulerDatabaseLogsTableRotateCron string `mapstructure:"scheduler-db-servers-logs-table-rotate-cron" toml:"scheduler-db-servers-logs-table-rotate-cron" json:"schedulerDbServersLogsTableRotateCron"`
647654
SchedulerMaintenanceDatabaseLogsTableKeep int `mapstructure:"scheduler-db-servers-logs-table-keep" toml:"scheduler-db-servers-logs-table-keep" json:"schedulerDatabaseLogsTableKeep"`
@@ -1162,6 +1169,7 @@ const (
11621169
ConstLogModMailer = 19
11631170
ConstLogModSupport = 20
11641171
ConstLogModExternalScript = 21
1172+
ConstLogModStats = 22
11651173
)
11661174

11671175
/*
@@ -2989,6 +2997,8 @@ func (conf *Config) IsEligibleForPrinting(module int, level string) bool {
29892997
return conf.LogMailerLevel >= lvl
29902998
case module == ConstLogModSupport:
29912999
return conf.LogSupportLevel >= lvl
3000+
case module == ConstLogModStats:
3001+
return conf.LogStatsLevel >= lvl
29923002
}
29933003
}
29943004

@@ -3154,6 +3164,8 @@ func GetTagsForLog(module int) string {
31543164
return "job"
31553165
case ConstLogModExternalScript:
31563166
return "externalscript"
3167+
case ConstLogModStats:
3168+
return "stats"
31573169
}
31583170
return ""
31593171
}

config/error.go

+5
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,11 @@ var ClusterError = map[string]string{
201201
"WARN0136": "Enforce GTID replication is not yet available in MySQL. Skipping server %s",
202202
"WARN0137": "Unable to open certificate file %s. Err: %s",
203203
"WARN0138": "Unable to send mail alert. Please check your mailer configuration!",
204+
"WARN0139": "Backup partition used size exceed warning treshold. Mount: %s Used: %s Treshold: %s",
205+
"WARN0140": "Backup partition used size exceed critical treshold. Mount: %s Used: %s Treshold: %s",
206+
"WARN0141": "Not enough free space estimated for %s logical backup on %s. Mount: %s Free: %s Required: %s",
207+
"WARN0142": "Not enough free space estimated for %s physical backup on %s. Mount: %s Free: %s Required: %s",
208+
"WARN0143": "Not enough free space estimated for binary log backup on %s. Mount: %s Free: %s Required: %s",
204209
"MDEV20821": "MariaDB version has replication issue https://jira.mariadb.org/browse/MDEV-20821",
205210
"MDEV28310": "MariaDB version has replication issue for non row format https://jira.mariadb.org/browse/MDEV-28310",
206211
"MDEV19577": "MariaDB version has replication issue for non row format https://jira.mariadb.org/browse/MDEV-19577",

server/api_cluster.go

+26
Original file line numberDiff line numberDiff line change
@@ -2153,6 +2153,10 @@ func (repman *ReplicationManager) switchClusterSettings(mycluster *cluster.Clust
21532153
mycluster.SwitchBackupBinlogs()
21542154
case "compress-backups":
21552155
mycluster.SwitchCompressBackups()
2156+
case "backup-check-free-space":
2157+
mycluster.Conf.BackupCheckFreeSpace = !mycluster.Conf.BackupCheckFreeSpace
2158+
case "backup-estimate-size":
2159+
mycluster.Conf.BackupEstimateSize = !mycluster.Conf.BackupEstimateSize
21562160
case "monitoring-pause":
21572161
mycluster.SwitchMonitoringPause()
21582162
case "monitoring-save-config":
@@ -2523,6 +2527,18 @@ func (repman *ReplicationManager) setClusterSetting(mycluster *cluster.Cluster,
25232527
if err != nil {
25242528
return err
25252529
}
2530+
case "backup-disk-treshold-warn":
2531+
val, _ := strconv.Atoi(value)
2532+
mycluster.Conf.BackupDiskTresholdWarn = val
2533+
case "backup-disk-treshold-crit":
2534+
val, _ := strconv.Atoi(value)
2535+
mycluster.Conf.BackupDiskTresholdCrit = val
2536+
case "backup-estimate-size-percentage":
2537+
val, _ := strconv.Atoi(value)
2538+
mycluster.Conf.BackupEstimateSizePercentage = val
2539+
case "backup-growth-percentage":
2540+
val, _ := strconv.Atoi(value)
2541+
mycluster.Conf.BackupGrowthPercentage = val
25262542
case "backup-logical-type":
25272543
mycluster.SetBackupLogicalType(value)
25282544
case "backup-physical-type":
@@ -2718,6 +2734,9 @@ func (repman *ReplicationManager) setClusterSetting(mycluster *cluster.Cluster,
27182734
case "log-external-script-level":
27192735
val, _ := strconv.Atoi(value)
27202736
mycluster.SetLogExternalScriptLevel(val)
2737+
case "log-stats-level":
2738+
val, _ := strconv.Atoi(value)
2739+
mycluster.Conf.LogStatsLevel = val
27212740
case "monitoring-ignore-errors":
27222741
mycluster.SetMonitorIgnoreErrors(value)
27232742
case "monitoring-capture-trigger":
@@ -3135,6 +3154,10 @@ func (repman *ReplicationManager) setClusterSetting(mycluster *cluster.Cluster,
31353154
}
31363155
case "compress-backups":
31373156
mycluster.Conf.CompressBackups = isactive
3157+
case "backup-check-free-space":
3158+
mycluster.Conf.BackupCheckFreeSpace = isactive
3159+
case "backup-estimate-size":
3160+
mycluster.Conf.BackupEstimateSize = isactive
31383161
case "monitoring-pause":
31393162
mycluster.Conf.MonitorPause = isactive
31403163
case "monitoring-save-config":
@@ -3532,6 +3555,9 @@ func (repman *ReplicationManager) setRepmanSetting(name string, value string) er
35323555
case "log-support-level":
35333556
val, _ := strconv.Atoi(value)
35343557
repman.Conf.SetLogSupportLevel(val)
3558+
case "log-stats-level":
3559+
val, _ := strconv.Atoi(value)
3560+
repman.Conf.LogStatsLevel = val
35353561
case "mail-smtp-addr":
35363562
repman.Conf.SetMailSmtpAddr(value)
35373563
repman.Mailer.UpdateAddress(value)

server/repmanv3.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ func (s *ReplicationManager) MasterPhysicalBackup(ctx context.Context, in *v3.Cl
408408
if m == nil {
409409
return nil, v3.NewErrorResource(codes.InvalidArgument, v3.ErrClusterMasterNotSet, "cluster", in.Name).Err()
410410
}
411-
_, err = m.JobBackupPhysical()
411+
err = m.JobBackupPhysical()
412412
return &emptypb.Empty{}, err
413413
}
414414

@@ -608,7 +608,7 @@ func (s *ReplicationManager) PerformClusterAction(ctx context.Context, in *v3.Cl
608608
if m == nil {
609609
return nil, v3.NewErrorResource(codes.InvalidArgument, v3.ErrClusterMasterNotSet, "cluster", in.Cluster.Name).Err()
610610
}
611-
_, err = m.JobBackupPhysical()
611+
err = m.JobBackupPhysical()
612612
case v3.ClusterAction_OPTIMIZE:
613613
mycluster.RollingOptimize()
614614
case v3.ClusterAction_RESET_FAILOVER_CONTROL:

0 commit comments

Comments
 (0)