Skip to content

Commit 6b669fb

Browse files
itcmsgrclaude
andcommitted
feat(rebuild): v1.96 PR-01 — rebuild recovery types + marker foundation
Add internal/rebuild package with: - OperationResult enum (SUCCESS, FAILED_RECOVERED, FAILED_DEGRADED, FAILED_FATAL) - FailureClass enum (12 classes: PREVALIDATION_FAILED through RETRY_EXHAUSTED) - ModuleRestoreResult enum (3-level verification: structure, wiring, activation) - RetryDisposition enum + GetRetryDisposition() policy function - RecoveryMarker struct with JSON persistence (read/write/clear) - ModuleRestoreReport with per-module tracking - Comprehensive tests for policy logic, marker lifecycle, serialization Contract: V196_REBUILD_RECOVERY_CONTRACT.md Invariants: INV-RR-001 through INV-RR-010 No behavior change — foundation types only. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4255053 commit 6b669fb

5 files changed

Lines changed: 888 additions & 0 deletions

File tree

internal/rebuild/marker.go

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
// =============================================================================
2+
// NFTBan v1.96 - Recovery Marker Persistence
3+
// =============================================================================
4+
// SPDX-License-Identifier: MPL-2.0
5+
// meta:name="rebuild-marker"
6+
// meta:type="lib"
7+
// meta:version="1.96.0"
8+
// meta:owner="Antonios Voulvoulis <contact@nftban.com>"
9+
// meta:created_date="2026-04-17"
10+
// meta:description="Persistent recovery marker for rebuild failure state and retry tracking"
11+
// meta:inventory.files="internal/rebuild/marker.go"
12+
// meta:inventory.binaries=""
13+
// meta:inventory.env_vars=""
14+
// meta:inventory.config_files=""
15+
// meta:inventory.systemd_units=""
16+
// meta:inventory.network=""
17+
// meta:inventory.privileges="none"
18+
//
19+
// Contract: V196_REBUILD_RECOVERY_CONTRACT.md §7
20+
// Marker path: /var/lib/nftban/state/rebuild_recovery.json
21+
// INV-RR-006: Retry count is finite and persisted
22+
// =============================================================================
23+
24+
package rebuild
25+
26+
import (
27+
"encoding/json"
28+
"fmt"
29+
"os"
30+
"path/filepath"
31+
"time"
32+
)
33+
34+
// DefaultMarkerPath is where the recovery marker is stored when no explicit
// path is given; ReadMarker, Write and Clear all operate on this file.
const DefaultMarkerPath = "/var/lib/nftban/state/rebuild_recovery.json"
36+
37+
// RecoveryMarker persists rebuild failure state for recovery tracking.
38+
// Written on any non-SUCCESS rebuild outcome (except PREVALIDATION_FAILED).
39+
// Cleared on SUCCESS. Read by deferred retry service.
40+
type RecoveryMarker struct {
41+
// FailureClass categorizes the root cause.
42+
FailureClass FailureClass `json:"failure_class"`
43+
44+
// OperationResult is the rebuild operation outcome.
45+
OperationResult OperationResult `json:"operation_result"`
46+
47+
// RetryCount is the number of retry attempts so far (immediate + deferred).
48+
RetryCount int `json:"retry_count"`
49+
50+
// MaxRetries is the configured maximum total retries.
51+
MaxRetries int `json:"max_retries"`
52+
53+
// DeferredRetryPending indicates a deferred retry should be attempted.
54+
DeferredRetryPending bool `json:"deferred_retry_pending"`
55+
56+
// FirstFailureAt is the timestamp of the initial failure.
57+
FirstFailureAt time.Time `json:"first_failure_at"`
58+
59+
// LastFailureAt is the timestamp of the most recent failure/retry.
60+
LastFailureAt time.Time `json:"last_failure_at"`
61+
62+
// RollbackAttempted indicates whether rollback was triggered.
63+
RollbackAttempted bool `json:"rollback_attempted"`
64+
65+
// RollbackResult is the outcome of the rollback attempt.
66+
RollbackResult string `json:"rollback_result"` // "success", "failed", "not_attempted"
67+
68+
// BackupPath is the snapshot directory used for rollback.
69+
BackupPath string `json:"backup_path"`
70+
71+
// LastHealthState is the validator state after recovery settled.
72+
LastHealthState string `json:"last_health_state"` // "protected", "degraded", "down"
73+
74+
// Exhausted is true when max retries are reached.
75+
Exhausted bool `json:"exhausted"`
76+
77+
// ModuleRestore captures per-module restore outcomes.
78+
ModuleRestore *ModuleRestoreReport `json:"module_restore,omitempty"`
79+
80+
// DaemonRelated indicates whether the failure involved daemon unavailability.
81+
DaemonRelated bool `json:"daemon_related"`
82+
}
83+
84+
// ReadMarker reads a recovery marker from the default path.
85+
// Returns nil, nil if marker does not exist (no recovery pending).
86+
func ReadMarker() (*RecoveryMarker, error) {
87+
return ReadMarkerFrom(DefaultMarkerPath)
88+
}
89+
90+
// ReadMarkerFrom reads a recovery marker from a specific path.
91+
// Returns nil, nil if marker does not exist.
92+
func ReadMarkerFrom(path string) (*RecoveryMarker, error) {
93+
data, err := os.ReadFile(path) // #nosec G304 — path is controlled
94+
if err != nil {
95+
if os.IsNotExist(err) {
96+
return nil, nil
97+
}
98+
return nil, fmt.Errorf("read recovery marker: %w", err)
99+
}
100+
101+
var m RecoveryMarker
102+
if err := json.Unmarshal(data, &m); err != nil {
103+
return nil, fmt.Errorf("parse recovery marker: %w", err)
104+
}
105+
106+
return &m, nil
107+
}
108+
109+
// Write persists the recovery marker atomically.
110+
// Uses temp file + rename for crash safety.
111+
func (m *RecoveryMarker) Write() error {
112+
return m.WriteTo(DefaultMarkerPath)
113+
}
114+
115+
// WriteTo persists the recovery marker to a specific path.
116+
func (m *RecoveryMarker) WriteTo(path string) error {
117+
data, err := json.MarshalIndent(m, "", " ")
118+
if err != nil {
119+
return fmt.Errorf("marshal recovery marker: %w", err)
120+
}
121+
122+
dir := filepath.Dir(path)
123+
if err := os.MkdirAll(dir, 0750); err != nil {
124+
return fmt.Errorf("create marker directory: %w", err)
125+
}
126+
127+
tmp := path + ".tmp"
128+
if err := os.WriteFile(tmp, data, 0640); err != nil { // #nosec G306 — intentional 0640
129+
return fmt.Errorf("write temp marker: %w", err)
130+
}
131+
132+
if err := os.Rename(tmp, path); err != nil {
133+
_ = os.Remove(tmp) // best-effort cleanup
134+
return fmt.Errorf("rename marker: %w", err)
135+
}
136+
137+
return nil
138+
}
139+
140+
// Clear removes the recovery marker (rebuild succeeded).
141+
func Clear() error {
142+
return ClearFrom(DefaultMarkerPath)
143+
}
144+
145+
// ClearFrom deletes the recovery marker stored at path.
//
// Removing a marker that does not exist is treated as success. Any other
// removal failure is returned, wrapped with context.
func ClearFrom(path string) error {
	rmErr := os.Remove(path)
	if rmErr == nil || os.IsNotExist(rmErr) {
		// Either the marker was removed or there was nothing to remove.
		return nil
	}
	return fmt.Errorf("clear recovery marker: %w", rmErr)
}
153+
154+
// NewMarker creates a new recovery marker for an initial failure.
155+
func NewMarker(class FailureClass, result OperationResult) *RecoveryMarker {
156+
now := time.Now()
157+
return &RecoveryMarker{
158+
FailureClass: class,
159+
OperationResult: result,
160+
RetryCount: 0,
161+
MaxRetries: MaxImmediateRetries + MaxDeferredRetries,
162+
DeferredRetryPending: false,
163+
FirstFailureAt: now,
164+
LastFailureAt: now,
165+
RollbackAttempted: false,
166+
RollbackResult: "not_attempted",
167+
Exhausted: false,
168+
}
169+
}
170+
171+
// IncrementRetry updates the marker for a retry attempt.
172+
func (m *RecoveryMarker) IncrementRetry() {
173+
m.RetryCount++
174+
m.LastFailureAt = time.Now()
175+
if m.RetryCount >= m.MaxRetries {
176+
m.Exhausted = true
177+
m.DeferredRetryPending = false
178+
}
179+
}
180+
181+
// ShouldDeferRetry returns true if a deferred retry should be scheduled.
182+
func (m *RecoveryMarker) ShouldDeferRetry() bool {
183+
if m.Exhausted {
184+
return false
185+
}
186+
if neverRetryClasses[m.FailureClass] {
187+
return false
188+
}
189+
return m.RetryCount < m.MaxRetries
190+
}
191+
192+
// SetRollbackResult records the rollback outcome.
193+
func (m *RecoveryMarker) SetRollbackResult(success bool) {
194+
m.RollbackAttempted = true
195+
if success {
196+
m.RollbackResult = "success"
197+
} else {
198+
m.RollbackResult = "failed"
199+
}
200+
}

0 commit comments

Comments
 (0)