|
| 1 | +// ============================================================================= |
| 2 | +// NFTBan v1.96 - Recovery Marker Persistence |
| 3 | +// ============================================================================= |
| 4 | +// SPDX-License-Identifier: MPL-2.0 |
| 5 | +// meta:name="rebuild-marker" |
| 6 | +// meta:type="lib" |
| 7 | +// meta:version="1.96.0" |
| 8 | +// meta:owner="Antonios Voulvoulis <contact@nftban.com>" |
| 9 | +// meta:created_date="2026-04-17" |
| 10 | +// meta:description="Persistent recovery marker for rebuild failure state and retry tracking" |
| 11 | +// meta:inventory.files="internal/rebuild/marker.go" |
| 12 | +// meta:inventory.binaries="" |
| 13 | +// meta:inventory.env_vars="" |
| 14 | +// meta:inventory.config_files="" |
| 15 | +// meta:inventory.systemd_units="" |
| 16 | +// meta:inventory.network="" |
| 17 | +// meta:inventory.privileges="none" |
| 18 | +// |
| 19 | +// Contract: V196_REBUILD_RECOVERY_CONTRACT.md §7 |
| 20 | +// Marker path: /var/lib/nftban/state/rebuild_recovery.json |
| 21 | +// INV-RR-006: Retry count is finite and persisted |
| 22 | +// ============================================================================= |
| 23 | + |
| 24 | +package rebuild |
| 25 | + |
| 26 | +import ( |
| 27 | + "encoding/json" |
| 28 | + "fmt" |
| 29 | + "os" |
| 30 | + "path/filepath" |
| 31 | + "time" |
| 32 | +) |
| 33 | + |
// DefaultMarkerPath is the file the recovery marker is read from, written to,
// and removed from when a caller does not supply an explicit path.
const DefaultMarkerPath = "/var/lib/nftban/state/rebuild_recovery.json"
| 36 | + |
| 37 | +// RecoveryMarker persists rebuild failure state for recovery tracking. |
| 38 | +// Written on any non-SUCCESS rebuild outcome (except PREVALIDATION_FAILED). |
| 39 | +// Cleared on SUCCESS. Read by deferred retry service. |
| 40 | +type RecoveryMarker struct { |
| 41 | + // FailureClass categorizes the root cause. |
| 42 | + FailureClass FailureClass `json:"failure_class"` |
| 43 | + |
| 44 | + // OperationResult is the rebuild operation outcome. |
| 45 | + OperationResult OperationResult `json:"operation_result"` |
| 46 | + |
| 47 | + // RetryCount is the number of retry attempts so far (immediate + deferred). |
| 48 | + RetryCount int `json:"retry_count"` |
| 49 | + |
| 50 | + // MaxRetries is the configured maximum total retries. |
| 51 | + MaxRetries int `json:"max_retries"` |
| 52 | + |
| 53 | + // DeferredRetryPending indicates a deferred retry should be attempted. |
| 54 | + DeferredRetryPending bool `json:"deferred_retry_pending"` |
| 55 | + |
| 56 | + // FirstFailureAt is the timestamp of the initial failure. |
| 57 | + FirstFailureAt time.Time `json:"first_failure_at"` |
| 58 | + |
| 59 | + // LastFailureAt is the timestamp of the most recent failure/retry. |
| 60 | + LastFailureAt time.Time `json:"last_failure_at"` |
| 61 | + |
| 62 | + // RollbackAttempted indicates whether rollback was triggered. |
| 63 | + RollbackAttempted bool `json:"rollback_attempted"` |
| 64 | + |
| 65 | + // RollbackResult is the outcome of the rollback attempt. |
| 66 | + RollbackResult string `json:"rollback_result"` // "success", "failed", "not_attempted" |
| 67 | + |
| 68 | + // BackupPath is the snapshot directory used for rollback. |
| 69 | + BackupPath string `json:"backup_path"` |
| 70 | + |
| 71 | + // LastHealthState is the validator state after recovery settled. |
| 72 | + LastHealthState string `json:"last_health_state"` // "protected", "degraded", "down" |
| 73 | + |
| 74 | + // Exhausted is true when max retries are reached. |
| 75 | + Exhausted bool `json:"exhausted"` |
| 76 | + |
| 77 | + // ModuleRestore captures per-module restore outcomes. |
| 78 | + ModuleRestore *ModuleRestoreReport `json:"module_restore,omitempty"` |
| 79 | + |
| 80 | + // DaemonRelated indicates whether the failure involved daemon unavailability. |
| 81 | + DaemonRelated bool `json:"daemon_related"` |
| 82 | +} |
| 83 | + |
| 84 | +// ReadMarker reads a recovery marker from the default path. |
| 85 | +// Returns nil, nil if marker does not exist (no recovery pending). |
| 86 | +func ReadMarker() (*RecoveryMarker, error) { |
| 87 | + return ReadMarkerFrom(DefaultMarkerPath) |
| 88 | +} |
| 89 | + |
| 90 | +// ReadMarkerFrom reads a recovery marker from a specific path. |
| 91 | +// Returns nil, nil if marker does not exist. |
| 92 | +func ReadMarkerFrom(path string) (*RecoveryMarker, error) { |
| 93 | + data, err := os.ReadFile(path) // #nosec G304 — path is controlled |
| 94 | + if err != nil { |
| 95 | + if os.IsNotExist(err) { |
| 96 | + return nil, nil |
| 97 | + } |
| 98 | + return nil, fmt.Errorf("read recovery marker: %w", err) |
| 99 | + } |
| 100 | + |
| 101 | + var m RecoveryMarker |
| 102 | + if err := json.Unmarshal(data, &m); err != nil { |
| 103 | + return nil, fmt.Errorf("parse recovery marker: %w", err) |
| 104 | + } |
| 105 | + |
| 106 | + return &m, nil |
| 107 | +} |
| 108 | + |
| 109 | +// Write persists the recovery marker atomically. |
| 110 | +// Uses temp file + rename for crash safety. |
| 111 | +func (m *RecoveryMarker) Write() error { |
| 112 | + return m.WriteTo(DefaultMarkerPath) |
| 113 | +} |
| 114 | + |
| 115 | +// WriteTo persists the recovery marker to a specific path. |
| 116 | +func (m *RecoveryMarker) WriteTo(path string) error { |
| 117 | + data, err := json.MarshalIndent(m, "", " ") |
| 118 | + if err != nil { |
| 119 | + return fmt.Errorf("marshal recovery marker: %w", err) |
| 120 | + } |
| 121 | + |
| 122 | + dir := filepath.Dir(path) |
| 123 | + if err := os.MkdirAll(dir, 0750); err != nil { |
| 124 | + return fmt.Errorf("create marker directory: %w", err) |
| 125 | + } |
| 126 | + |
| 127 | + tmp := path + ".tmp" |
| 128 | + if err := os.WriteFile(tmp, data, 0640); err != nil { // #nosec G306 — intentional 0640 |
| 129 | + return fmt.Errorf("write temp marker: %w", err) |
| 130 | + } |
| 131 | + |
| 132 | + if err := os.Rename(tmp, path); err != nil { |
| 133 | + _ = os.Remove(tmp) // best-effort cleanup |
| 134 | + return fmt.Errorf("rename marker: %w", err) |
| 135 | + } |
| 136 | + |
| 137 | + return nil |
| 138 | +} |
| 139 | + |
| 140 | +// Clear removes the recovery marker (rebuild succeeded). |
| 141 | +func Clear() error { |
| 142 | + return ClearFrom(DefaultMarkerPath) |
| 143 | +} |
| 144 | + |
// ClearFrom deletes the recovery marker stored at path.
// A marker that is already absent is treated as success; any other removal
// failure is returned wrapped with "clear recovery marker".
func ClearFrom(path string) error {
	switch rmErr := os.Remove(path); {
	case rmErr == nil, os.IsNotExist(rmErr):
		return nil
	default:
		return fmt.Errorf("clear recovery marker: %w", rmErr)
	}
}
| 153 | + |
| 154 | +// NewMarker creates a new recovery marker for an initial failure. |
| 155 | +func NewMarker(class FailureClass, result OperationResult) *RecoveryMarker { |
| 156 | + now := time.Now() |
| 157 | + return &RecoveryMarker{ |
| 158 | + FailureClass: class, |
| 159 | + OperationResult: result, |
| 160 | + RetryCount: 0, |
| 161 | + MaxRetries: MaxImmediateRetries + MaxDeferredRetries, |
| 162 | + DeferredRetryPending: false, |
| 163 | + FirstFailureAt: now, |
| 164 | + LastFailureAt: now, |
| 165 | + RollbackAttempted: false, |
| 166 | + RollbackResult: "not_attempted", |
| 167 | + Exhausted: false, |
| 168 | + } |
| 169 | +} |
| 170 | + |
| 171 | +// IncrementRetry updates the marker for a retry attempt. |
| 172 | +func (m *RecoveryMarker) IncrementRetry() { |
| 173 | + m.RetryCount++ |
| 174 | + m.LastFailureAt = time.Now() |
| 175 | + if m.RetryCount >= m.MaxRetries { |
| 176 | + m.Exhausted = true |
| 177 | + m.DeferredRetryPending = false |
| 178 | + } |
| 179 | +} |
| 180 | + |
| 181 | +// ShouldDeferRetry returns true if a deferred retry should be scheduled. |
| 182 | +func (m *RecoveryMarker) ShouldDeferRetry() bool { |
| 183 | + if m.Exhausted { |
| 184 | + return false |
| 185 | + } |
| 186 | + if neverRetryClasses[m.FailureClass] { |
| 187 | + return false |
| 188 | + } |
| 189 | + return m.RetryCount < m.MaxRetries |
| 190 | +} |
| 191 | + |
| 192 | +// SetRollbackResult records the rollback outcome. |
| 193 | +func (m *RecoveryMarker) SetRollbackResult(success bool) { |
| 194 | + m.RollbackAttempted = true |
| 195 | + if success { |
| 196 | + m.RollbackResult = "success" |
| 197 | + } else { |
| 198 | + m.RollbackResult = "failed" |
| 199 | + } |
| 200 | +} |
0 commit comments