@@ -18,15 +18,14 @@ import (
1818 "context"
1919 "errors"
2020 "fmt"
21- "github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
22- "github.com/nvidia/nvsentinel/fault-remediation/pkg/events"
23- "github.com/nvidia/nvsentinel/fault-remediation/pkg/metrics"
24- "github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
25- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2621 "log/slog"
2722 "strings"
2823 "time"
2924
25+ "github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
26+ "github.com/nvidia/nvsentinel/fault-remediation/pkg/events"
27+ "github.com/nvidia/nvsentinel/fault-remediation/pkg/metrics"
28+ "github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
3029 "k8s.io/client-go/util/workqueue"
3130 ctrl "sigs.k8s.io/controller-runtime"
3231 "sigs.k8s.io/controller-runtime/pkg/builder"
@@ -54,7 +53,6 @@ type ReconcilerConfig struct {
5453 EnableLogCollector bool
5554 UpdateMaxRetries int
5655 UpdateRetryDelay time.Duration
57- test metav1.Time
5856}
5957
6058// FaultRemediationReconciler reconciles health events from a datastore change stream
@@ -170,7 +168,11 @@ func (r *FaultRemediationReconciler) shouldSkipEvent(ctx context.Context,
170168}
171169
172170// runLogCollector runs log collector for non-NONE actions if enabled
173- func (r * FaultRemediationReconciler ) runLogCollector (ctx context.Context , healthEvent * protos.HealthEvent , eventUID string ) (ctrl.Result , error ) {
171+ func (r * FaultRemediationReconciler ) runLogCollector (
172+ ctx context.Context ,
173+ healthEvent * protos.HealthEvent ,
174+ eventUID string ,
175+ ) (ctrl.Result , error ) {
174176 if healthEvent .RecommendedAction == protos .RecommendedAction_NONE || ! r .Config .EnableLogCollector {
175177 return ctrl.Result {}, nil
176178 }
@@ -204,6 +206,7 @@ func (r *FaultRemediationReconciler) performRemediation(
204206 if err != nil {
205207 slog .Error ("Error updating node label to remediating" , "error" , err )
206208 metrics .ProcessingErrors .WithLabelValues ("label_update_error" , nodeName ).Inc ()
209+
207210 return "" , err
208211 }
209212
@@ -213,9 +216,11 @@ func (r *FaultRemediationReconciler) performRemediation(
213216 }
214217
215218 remediationLabelValue := statemanager .RemediationSucceededLabelValue
219+
216220 crName , createMaintenanceResourceError := r .Config .RemediationClient .CreateMaintenanceResource (ctx , healthEventData )
217221 if createMaintenanceResourceError != nil {
218222 metrics .ProcessingErrors .WithLabelValues ("cr_creation_failed" , nodeName ).Inc ()
223+
219224 remediationLabelValue = statemanager .RemediationFailedLabelValue
220225 // don't throw error yet so we can update state
221226 }
@@ -228,8 +233,10 @@ func (r *FaultRemediationReconciler) performRemediation(
228233 "label" , remediationLabelValue ,
229234 "error" , err )
230235 metrics .ProcessingErrors .WithLabelValues ("label_update_error" , nodeName ).Inc ()
236+
231237 return "" , errors .Join (createMaintenanceResourceError , err )
232238 }
239+
233240 if createMaintenanceResourceError != nil {
234241 return "" , createMaintenanceResourceError
235242 }
@@ -268,6 +275,7 @@ func (r *FaultRemediationReconciler) handleCancellationEvent(
268275}
269276
270277// handleRemediationEvent processes remediation for quarantined nodes
278+ // nolint: cyclop // todo
271279func (r * FaultRemediationReconciler ) handleRemediationEvent (
272280 ctx context.Context ,
273281 healthEventWithStatus * events.HealthEventDoc ,
@@ -283,6 +291,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
283291 if err := watcherInstance .MarkProcessed (ctx , eventWithToken .ResumeToken ); err != nil {
284292 metrics .ProcessingErrors .WithLabelValues ("mark_processed_error" , nodeName ).Inc ()
285293 slog .Error ("Error updating resume token" , "error" , err )
294+
286295 return ctrl.Result {}, err
287296 }
288297
@@ -307,6 +316,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
307316 if err = watcherInstance .MarkProcessed (ctx , eventWithToken .ResumeToken ); err != nil {
308317 metrics .ProcessingErrors .WithLabelValues ("mark_processed_error" , nodeName ).Inc ()
309318 slog .Error ("Error updating resume token" , "error" , err )
319+
310320 return ctrl.Result {}, err
311321 }
312322
@@ -320,13 +330,15 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
320330 }
321331
322332 _ , performRemediationErr := r .performRemediation (ctx , healthEventWithStatus )
333+
323334 nodeRemediatedStatus := performRemediationErr == nil // success if no error thrown
324335 if err = r .updateNodeRemediatedStatus (ctx , healthEventStore , eventWithToken , nodeRemediatedStatus ); err != nil {
325336 metrics .ProcessingErrors .WithLabelValues ("update_status_error" , nodeName ).Inc ()
326337 slog .Error ("Error updating remediation status for node" , "error" , err )
327338
328339 return ctrl.Result {}, errors .Join (performRemediationErr , err )
329340 }
341+
330342 if performRemediationErr != nil {
331343 return ctrl.Result {}, performRemediationErr
332344 }
@@ -336,6 +348,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
336348 if err = watcherInstance .MarkProcessed (ctx , eventWithToken .ResumeToken ); err != nil {
337349 metrics .ProcessingErrors .WithLabelValues ("mark_processed_error" , nodeName ).Inc ()
338350 slog .Error ("Error updating resume token" , "error" , err )
351+
339352 return ctrl.Result {}, err
340353 }
341354
0 commit comments