Skip to content

Commit 1956d7a

Browse files
committed
fix: enhance cleanup metrics tracking and reporting in backup process
1 parent 16b862a commit 1956d7a

File tree

2 files changed

+136
-27
lines changed

2 files changed

+136
-27
lines changed

cmd/main.go

Lines changed: 55 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -151,19 +151,10 @@ func runBackup(configFile, logLevel string, dryRun bool, databases string, force
151151
log.WithError(err).Warn("Failed to initialize file logger, using stdout")
152152
}
153153

154-
// Initialize Prometheus metrics if enabled (before any user interaction)
154+
// Initialize Prometheus metrics for recording only (no HTTP server in main binary)
155155
if cfg.Metrics.Enabled {
156156
metrics.Init()
157-
log.WithField("port", cfg.Metrics.Port).Debug("Initializing Prometheus metrics server")
158-
go func() {
159-
if err := metrics.StartMetricsServer(cfg.Metrics.Port); err != nil {
160-
log.WithError(err).WithField("port", cfg.Metrics.Port).Warn("Metrics server failed to start (backup will continue)")
161-
} else {
162-
log.WithField("port", cfg.Metrics.Port).Debug("Metrics server started successfully")
163-
}
164-
}()
165-
// Give metrics server a moment to start and potentially fail
166-
time.Sleep(200 * time.Millisecond)
157+
log.Debug("Metrics recording enabled")
167158
}
168159

169160
if dryRun {
@@ -393,15 +384,26 @@ func runCleanup(configFile, logLevel string, dryRun bool, force bool, databases
393384
cleanupStartTime := time.Now()
394385
var totalFilesRemoved int64
395386
var totalBytesFreed int64
387+
388+
// Record Prometheus metrics if enabled
389+
if cfg.Metrics.Enabled {
390+
metrics.RecordCleanupStart()
391+
}
396392

397393
// Perform cleanup of uploaded files (only if backup service is available)
398394
if backupService != nil {
399395
if err := backupService.CleanupUploadedFiles(ctx); err != nil {
400396
log.WithError(err).Error("Cleanup process failed")
401397
cleanupDuration := time.Since(cleanupStartTime)
402-
if cfg.Metrics.Enabled && metricsStorage != nil {
403-
if err := metricsStorage.UpdateCleanupMetrics(cleanupDuration, false, totalFilesRemoved, totalBytesFreed); err != nil {
404-
log.WithError(err).Warn("Failed to update cleanup metrics")
398+
if cfg.Metrics.Enabled {
399+
// Record failed cleanup in Prometheus
400+
metrics.RecordCleanupEnd(cleanupDuration, false, totalFilesRemoved, totalBytesFreed)
401+
402+
// Record in persistent storage
403+
if metricsStorage != nil {
404+
if err := metricsStorage.UpdateCleanupMetrics(cleanupDuration, false, totalFilesRemoved, totalBytesFreed); err != nil {
405+
log.WithError(err).Warn("Failed to update cleanup metrics")
406+
}
405407
}
406408
}
407409
os.Exit(1)
@@ -416,22 +418,39 @@ func runCleanup(configFile, logLevel string, dryRun bool, force bool, databases
416418
maxAgeDays = 7 // Safe default: 7 days
417419
}
418420

419-
if err := cleanupOldBackupFiles(cfg.Backup.Directory, selectedDatabases, maxAgeDays, log); err != nil {
421+
filesRemoved, bytesFreed, err := cleanupOldBackupFiles(cfg.Backup.Directory, selectedDatabases, maxAgeDays, log)
422+
if err != nil {
420423
log.WithError(err).Error("Age-based cleanup failed")
421424
cleanupDuration := time.Since(cleanupStartTime)
422-
if cfg.Metrics.Enabled && metricsStorage != nil {
423-
if err := metricsStorage.UpdateCleanupMetrics(cleanupDuration, false, totalFilesRemoved, totalBytesFreed); err != nil {
424-
log.WithError(err).Warn("Failed to update cleanup metrics")
425+
if cfg.Metrics.Enabled {
426+
// Record failed cleanup in Prometheus
427+
metrics.RecordCleanupEnd(cleanupDuration, false, totalFilesRemoved, totalBytesFreed)
428+
429+
// Record in persistent storage
430+
if metricsStorage != nil {
431+
if err := metricsStorage.UpdateCleanupMetrics(cleanupDuration, false, totalFilesRemoved, totalBytesFreed); err != nil {
432+
log.WithError(err).Warn("Failed to update cleanup metrics")
433+
}
425434
}
426435
}
427436
os.Exit(1)
428437
}
438+
439+
// Update metrics counters
440+
totalFilesRemoved += filesRemoved
441+
totalBytesFreed += bytesFreed
429442

430443
// Record successful cleanup
431444
cleanupDuration := time.Since(cleanupStartTime)
432-
if cfg.Metrics.Enabled && metricsStorage != nil {
433-
if err := metricsStorage.UpdateCleanupMetrics(cleanupDuration, true, totalFilesRemoved, totalBytesFreed); err != nil {
434-
log.WithError(err).Warn("Failed to update cleanup metrics")
445+
if cfg.Metrics.Enabled {
446+
// Record in Prometheus metrics
447+
metrics.RecordCleanupEnd(cleanupDuration, true, totalFilesRemoved, totalBytesFreed)
448+
449+
// Record in persistent storage
450+
if metricsStorage != nil {
451+
if err := metricsStorage.UpdateCleanupMetrics(cleanupDuration, true, totalFilesRemoved, totalBytesFreed); err != nil {
452+
log.WithError(err).Warn("Failed to update cleanup metrics")
453+
}
435454
}
436455
}
437456

@@ -1145,8 +1164,8 @@ func formatDuration(d time.Duration) string {
11451164
}
11461165
}
11471166

1148-
// cleanupOldBackupFiles removes backup files older than specified days
1149-
func cleanupOldBackupFiles(backupDir string, selectedDatabases []string, maxAgeDays int, log *logger.Logger) error {
1167+
// cleanupOldBackupFiles removes backup files older than specified days and returns metrics
1168+
func cleanupOldBackupFiles(backupDir string, selectedDatabases []string, maxAgeDays int, log *logger.Logger) (int64, int64, error) {
11501169
// Get all backup files
11511170
allBackupFiles := getBackupFiles(backupDir, selectedDatabases)
11521171

@@ -1158,20 +1177,28 @@ func cleanupOldBackupFiles(backupDir string, selectedDatabases []string, maxAgeD
11581177
}
11591178
}
11601179

1161-
// Delete old files
1180+
// Delete old files and track metrics
1181+
var deletedFiles int64
1182+
var bytesFreed int64
1183+
11621184
for _, fileInfo := range filesToDelete {
11631185
log.WithField("file", fileInfo.Name).
11641186
WithField("age_days", int(time.Since(fileInfo.ModTime).Hours()/24)).
11651187
Info("🗑️ Deleting old backup file")
11661188

11671189
if err := os.RemoveAll(fileInfo.Path); err != nil {
11681190
log.WithError(err).WithField("file", fileInfo.Path).Error("Failed to delete backup file")
1169-
return fmt.Errorf("failed to delete %s: %w", fileInfo.Path, err)
1191+
return deletedFiles, bytesFreed, fmt.Errorf("failed to delete %s: %w", fileInfo.Path, err)
11701192
}
1193+
1194+
deletedFiles++
1195+
bytesFreed += fileInfo.Size
11711196
}
11721197

1173-
log.WithField("deleted_files", len(filesToDelete)).Info("✅ Age-based cleanup completed")
1174-
return nil
1198+
log.WithField("deleted_files", deletedFiles).
1199+
WithField("bytes_freed", bytesFreed).
1200+
Info("✅ Age-based cleanup completed")
1201+
return deletedFiles, bytesFreed, nil
11751202
}
11761203

11771204
// formatFileSize formats file size in human readable format
@@ -2448,6 +2475,7 @@ func loadPartialConfig(configFile string) (*config.Config, error) {
24482475
viper.SetDefault("logging.file_format", "text")
24492476
viper.SetDefault("upload.enabled", false)
24502477
viper.SetDefault("metrics.enabled", false)
2478+
// Note: No port default for main binary - only used by exporter
24512479

24522480
if configFile != "" {
24532481
viper.SetConfigFile(configFile)

internal/metrics/metrics.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,57 @@ var (
158158
[]string{"provider"},
159159
)
160160

161+
// === CLEANUP METRICS ===
162+
163+
// Cleanup duration metric
164+
CleanupDurationSeconds = prometheus.NewHistogram(
165+
prometheus.HistogramOpts{
166+
Name: "tenangdb_cleanup_duration_seconds",
167+
Help: "Duration of cleanup operations in seconds",
168+
Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600},
169+
},
170+
)
171+
172+
// Cleanup success counter
173+
CleanupSuccessTotal = prometheus.NewCounter(
174+
prometheus.CounterOpts{
175+
Name: "tenangdb_cleanup_success_total",
176+
Help: "Total number of successful cleanup operations",
177+
},
178+
)
179+
180+
// Cleanup failure counter
181+
CleanupFailedTotal = prometheus.NewCounter(
182+
prometheus.CounterOpts{
183+
Name: "tenangdb_cleanup_failed_total",
184+
Help: "Total number of failed cleanup operations",
185+
},
186+
)
187+
188+
// Files removed counter
189+
FilesRemovedTotal = prometheus.NewCounter(
190+
prometheus.CounterOpts{
191+
Name: "tenangdb_cleanup_files_removed_total",
192+
Help: "Total number of files removed during cleanup",
193+
},
194+
)
195+
196+
// Bytes freed counter
197+
BytesFreedTotal = prometheus.NewCounter(
198+
prometheus.CounterOpts{
199+
Name: "tenangdb_cleanup_bytes_freed_total",
200+
Help: "Total number of bytes freed during cleanup",
201+
},
202+
)
203+
204+
// Last cleanup timestamp
205+
LastCleanupTimestamp = prometheus.NewGauge(
206+
prometheus.GaugeOpts{
207+
Name: "tenangdb_cleanup_last_timestamp",
208+
Help: "Timestamp of the last cleanup operation",
209+
},
210+
)
211+
161212
// === SYSTEM METRICS ===
162213

163214
// System health status
@@ -228,6 +279,14 @@ func Init() {
228279
RestoreFailedTotal,
229280
LastRestoreTimestamp,
230281

282+
// Cleanup metrics
283+
CleanupDurationSeconds,
284+
CleanupSuccessTotal,
285+
CleanupFailedTotal,
286+
FilesRemovedTotal,
287+
BytesFreedTotal,
288+
LastCleanupTimestamp,
289+
231290
// System metrics
232291
TotalDatabases,
233292
SystemHealthStatus,
@@ -291,6 +350,28 @@ func RecordRestoreEnd(database string, duration time.Duration, success bool) {
291350
ActiveOperations.WithLabelValues("restore").Dec()
292351
}
293352

353+
// === CLEANUP FUNCTIONS ===
354+
355+
// RecordCleanupStart records the start of a cleanup operation by incrementing
// the "cleanup" series of ActiveOperations. RecordCleanupEnd decrements the
// same series, so each call here should be paired with exactly one call to
// RecordCleanupEnd.
356+
func RecordCleanupStart() {
357+
ActiveOperations.WithLabelValues("cleanup").Inc()
358+
}
359+
360+
// RecordCleanupEnd records the end of a cleanup operation.
//
// It increments CleanupSuccessTotal when success is true and CleanupFailedTotal
// otherwise, observes duration (in seconds) on CleanupDurationSeconds, adds
// filesRemoved and bytesFreed to FilesRemovedTotal and BytesFreedTotal, sets
// LastCleanupTimestamp to the current Unix time, and decrements the "cleanup"
// series of ActiveOperations. The file and byte counts are added on both the
// success and the failure path.
//
// NOTE(review): prometheus Counter.Add panics on negative values, so
// filesRemoved and bytesFreed are presumably expected to be non-negative —
// confirm against callers.
361+
func RecordCleanupEnd(duration time.Duration, success bool, filesRemoved int64, bytesFreed int64) {
362+
if success {
363+
CleanupSuccessTotal.Inc()
364+
} else {
365+
CleanupFailedTotal.Inc()
366+
}
367+
368+
CleanupDurationSeconds.Observe(duration.Seconds())
369+
FilesRemovedTotal.Add(float64(filesRemoved))
370+
BytesFreedTotal.Add(float64(bytesFreed))
371+
LastCleanupTimestamp.Set(float64(time.Now().Unix()))
372+
ActiveOperations.WithLabelValues("cleanup").Dec()
373+
}
374+
294375
// === UPLOAD FUNCTIONS ===
295376

296377
// RecordUploadBytes records bytes uploaded

0 commit comments

Comments
 (0)