Skip to content

Commit 5d99015

Browse files
author
Igor Velichkovich
committed
lint
1 parent f9ebc85 commit 5d99015

File tree

11 files changed

+246
-114
lines changed

11 files changed

+246
-114
lines changed

fault-remediation/main.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,11 +167,13 @@ func setupNonCtrlRuntimeManaged(ctx context.Context, params initializer.Initiali
167167

168168
for event := range components.FaultRemediationReconciler.Watcher.Events() {
169169
slog.Info("Event received", "event", event)
170+
170171
for i := 1; i <= components.FaultRemediationReconciler.Config.UpdateMaxRetries; i++ {
171172
_, err = components.FaultRemediationReconciler.Reconcile(gCtx, &event)
172173
if err == nil {
173174
break
174175
}
176+
175177
slog.Error("Error processing event", "event", event, "error", err)
176178

177179
if i < components.FaultRemediationReconciler.Config.UpdateMaxRetries {

fault-remediation/pkg/annotation/annotation.go

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,12 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"log/slog"
8+
"time"
9+
710
corev1 "k8s.io/api/core/v1"
811
"k8s.io/apimachinery/pkg/types"
9-
"log/slog"
1012
"sigs.k8s.io/controller-runtime/pkg/client"
11-
"time"
1213
)
1314

1415
// CtrlRuntimeAnnotationManager manages node annotations for tracking remediation state.
@@ -29,6 +30,7 @@ func (m *CtrlRuntimeAnnotationManager) GetRemediationState(
2930
nodeName string,
3031
) (*RemediationStateAnnotation, *corev1.Node, error) {
3132
node := &corev1.Node{}
33+
3234
err := m.client.Get(ctx, types.NamespacedName{
3335
Name: nodeName,
3436
}, node)
@@ -105,15 +107,18 @@ func (m *CtrlRuntimeAnnotationManager) UpdateRemediationState(ctx context.Contex
105107
// ClearRemediationState removes the remediation state annotation from a node
106108
func (m *CtrlRuntimeAnnotationManager) ClearRemediationState(ctx context.Context, nodeName string) error {
107109
node := &corev1.Node{}
110+
108111
err := m.client.Get(ctx, types.NamespacedName{
109112
Name: nodeName,
110113
}, node)
111114
if err != nil {
112115
return fmt.Errorf("failed to get node %s: %w", nodeName, err)
113116
}
117+
114118
if node.Annotations == nil {
115119
return nil
116120
}
121+
117122
patchBase := node.DeepCopy()
118123
delete(node.Annotations, AnnotationKey)
119124

fault-remediation/pkg/annotation/annotation_interface.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,9 @@ package annotation
22

33
import (
44
"context"
5-
corev1 "k8s.io/api/core/v1"
65
"time"
6+
7+
corev1 "k8s.io/api/core/v1"
78
)
89

910
const (

fault-remediation/pkg/annotation/deprecated_annotation.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,7 @@ func (m *NodeAnnotationManager) GetRemediationState(
9292
if isRetryableError(err) {
9393
slog.Warn("Retryable error getting node", "node", nodeName, "error", err)
9494
}
95+
9596
return err
9697
}
9798

fault-remediation/pkg/crstatus/checker.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,11 @@ import (
1818
"context"
1919
"log/slog"
2020

21-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
2221
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2322
"k8s.io/apimachinery/pkg/runtime/schema"
2423
"sigs.k8s.io/controller-runtime/pkg/client"
24+
25+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
2526
)
2627

2728
type CtrlRuntimeCRStatusChecker struct {

fault-remediation/pkg/initializer/init.go

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,12 @@ package initializer
1717
import (
1818
"context"
1919
"fmt"
20-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
21-
"k8s.io/client-go/kubernetes"
2220
"log/slog"
2321
"time"
2422

23+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
24+
"k8s.io/client-go/kubernetes"
25+
2526
"github.com/nvidia/nvsentinel/commons/pkg/configmanager"
2627
"github.com/nvidia/nvsentinel/commons/pkg/statemanager"
2728
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
@@ -47,7 +48,11 @@ type Components struct {
4748
}
4849

4950
// nolint: cyclop // todo
50-
func InitializeAll(ctx context.Context, params InitializationParams, ctrlruntimeClient ctrlruntimeClient.Client) (*Components, error) {
51+
func InitializeAll(
52+
ctx context.Context,
53+
params InitializationParams,
54+
ctrlruntimeClient ctrlruntimeClient.Client,
55+
) (*Components, error) {
5156
slog.Info("Starting fault remediation module initialization")
5257

5358
if ctrlruntimeClient == nil && params.UseCtrlRuntime {
@@ -75,8 +80,11 @@ func InitializeAll(ctx context.Context, params InitializationParams, ctrlruntime
7580
slog.Info("Log collector enabled")
7681
}
7782

78-
var remediationClient remediation.FaultRemediationClientInterface
79-
var clientSet kubernetes.Interface
83+
var (
84+
remediationClient remediation.FaultRemediationClientInterface
85+
clientSet kubernetes.Interface
86+
)
87+
8088
if params.UseCtrlRuntime {
8189
remediationClient, err = remediation.NewCtrlRuntimeRemediationClient(
8290
ctrlruntimeClient,

fault-remediation/pkg/reconciler/reconciler.go

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,15 +18,14 @@ import (
1818
"context"
1919
"errors"
2020
"fmt"
21-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
22-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/events"
23-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/metrics"
24-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
25-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2621
"log/slog"
2722
"strings"
2823
"time"
2924

25+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
26+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/events"
27+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/metrics"
28+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
3029
"k8s.io/client-go/util/workqueue"
3130
ctrl "sigs.k8s.io/controller-runtime"
3231
"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -54,7 +53,6 @@ type ReconcilerConfig struct {
5453
EnableLogCollector bool
5554
UpdateMaxRetries int
5655
UpdateRetryDelay time.Duration
57-
test metav1.Time
5856
}
5957

6058
// FaultRemediationReconciler reconciles health events from a datastore change stream
@@ -170,7 +168,11 @@ func (r *FaultRemediationReconciler) shouldSkipEvent(ctx context.Context,
170168
}
171169

172170
// runLogCollector runs log collector for non-NONE actions if enabled
173-
func (r *FaultRemediationReconciler) runLogCollector(ctx context.Context, healthEvent *protos.HealthEvent, eventUID string) (ctrl.Result, error) {
171+
func (r *FaultRemediationReconciler) runLogCollector(
172+
ctx context.Context,
173+
healthEvent *protos.HealthEvent,
174+
eventUID string,
175+
) (ctrl.Result, error) {
174176
if healthEvent.RecommendedAction == protos.RecommendedAction_NONE || !r.Config.EnableLogCollector {
175177
return ctrl.Result{}, nil
176178
}
@@ -204,6 +206,7 @@ func (r *FaultRemediationReconciler) performRemediation(
204206
if err != nil {
205207
slog.Error("Error updating node label to remediating", "error", err)
206208
metrics.ProcessingErrors.WithLabelValues("label_update_error", nodeName).Inc()
209+
207210
return "", err
208211
}
209212

@@ -213,9 +216,11 @@ func (r *FaultRemediationReconciler) performRemediation(
213216
}
214217

215218
remediationLabelValue := statemanager.RemediationSucceededLabelValue
219+
216220
crName, createMaintenanceResourceError := r.Config.RemediationClient.CreateMaintenanceResource(ctx, healthEventData)
217221
if createMaintenanceResourceError != nil {
218222
metrics.ProcessingErrors.WithLabelValues("cr_creation_failed", nodeName).Inc()
223+
219224
remediationLabelValue = statemanager.RemediationFailedLabelValue
220225
// don't throw error yet so we can update state
221226
}
@@ -228,8 +233,10 @@ func (r *FaultRemediationReconciler) performRemediation(
228233
"label", remediationLabelValue,
229234
"error", err)
230235
metrics.ProcessingErrors.WithLabelValues("label_update_error", nodeName).Inc()
236+
231237
return "", errors.Join(createMaintenanceResourceError, err)
232238
}
239+
233240
if createMaintenanceResourceError != nil {
234241
return "", createMaintenanceResourceError
235242
}
@@ -268,6 +275,7 @@ func (r *FaultRemediationReconciler) handleCancellationEvent(
268275
}
269276

270277
// handleRemediationEvent processes remediation for quarantined nodes
278+
// nolint: cyclop // todo
271279
func (r *FaultRemediationReconciler) handleRemediationEvent(
272280
ctx context.Context,
273281
healthEventWithStatus *events.HealthEventDoc,
@@ -283,6 +291,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
283291
if err := watcherInstance.MarkProcessed(ctx, eventWithToken.ResumeToken); err != nil {
284292
metrics.ProcessingErrors.WithLabelValues("mark_processed_error", nodeName).Inc()
285293
slog.Error("Error updating resume token", "error", err)
294+
286295
return ctrl.Result{}, err
287296
}
288297

@@ -307,6 +316,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
307316
if err = watcherInstance.MarkProcessed(ctx, eventWithToken.ResumeToken); err != nil {
308317
metrics.ProcessingErrors.WithLabelValues("mark_processed_error", nodeName).Inc()
309318
slog.Error("Error updating resume token", "error", err)
319+
310320
return ctrl.Result{}, err
311321
}
312322

@@ -320,13 +330,15 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
320330
}
321331

322332
_, performRemediationErr := r.performRemediation(ctx, healthEventWithStatus)
333+
323334
nodeRemediatedStatus := performRemediationErr == nil // success if no error thrown
324335
if err = r.updateNodeRemediatedStatus(ctx, healthEventStore, eventWithToken, nodeRemediatedStatus); err != nil {
325336
metrics.ProcessingErrors.WithLabelValues("update_status_error", nodeName).Inc()
326337
slog.Error("Error updating remediation status for node", "error", err)
327338

328339
return ctrl.Result{}, errors.Join(performRemediationErr, err)
329340
}
341+
330342
if performRemediationErr != nil {
331343
return ctrl.Result{}, performRemediationErr
332344
}
@@ -336,6 +348,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent(
336348
if err = watcherInstance.MarkProcessed(ctx, eventWithToken.ResumeToken); err != nil {
337349
metrics.ProcessingErrors.WithLabelValues("mark_processed_error", nodeName).Inc()
338350
slog.Error("Error updating resume token", "error", err)
351+
339352
return ctrl.Result{}, err
340353
}
341354

fault-remediation/pkg/reconciler/reconciler_e2e_test.go

Lines changed: 5 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -16,29 +16,15 @@ package reconciler
1616

1717
import (
1818
"context"
19-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
20-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/events"
21-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/metrics"
22-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
23-
"k8s.io/client-go/kubernetes/scheme"
19+
2420
"log"
2521
"os"
2622
"path/filepath"
27-
ctrl "sigs.k8s.io/controller-runtime"
2823
"sigs.k8s.io/controller-runtime/pkg/client"
29-
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3024
"sync"
3125
"testing"
3226
"time"
3327

34-
"github.com/nvidia/nvsentinel/data-models/pkg/model"
35-
"github.com/nvidia/nvsentinel/data-models/pkg/protos"
36-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/common"
37-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
38-
"github.com/nvidia/nvsentinel/store-client/pkg/datastore"
39-
"github.com/nvidia/nvsentinel/store-client/pkg/testutils"
40-
41-
"github.com/nvidia/nvsentinel/commons/pkg/statemanager"
4228
"github.com/prometheus/client_golang/prometheus"
4329
dto "github.com/prometheus/client_model/go"
4430
"github.com/stretchr/testify/assert"
@@ -50,17 +36,19 @@ import (
5036
"k8s.io/client-go/kubernetes"
5137
"k8s.io/client-go/kubernetes/scheme"
5238
"k8s.io/client-go/rest"
53-
"k8s.io/client-go/restmapper"
5439
ctrl "sigs.k8s.io/controller-runtime"
5540
"sigs.k8s.io/controller-runtime/pkg/envtest"
5641
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
5742

5843
"github.com/nvidia/nvsentinel/commons/pkg/statemanager"
5944
"github.com/nvidia/nvsentinel/data-models/pkg/model"
6045
"github.com/nvidia/nvsentinel/data-models/pkg/protos"
46+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
6147
"github.com/nvidia/nvsentinel/fault-remediation/pkg/common"
6248
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
63-
"github.com/nvidia/nvsentinel/fault-remediation/pkg/crstatus"
49+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/events"
50+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/metrics"
51+
"github.com/nvidia/nvsentinel/fault-remediation/pkg/remediation"
6452
"github.com/nvidia/nvsentinel/store-client/pkg/datastore"
6553
"github.com/nvidia/nvsentinel/store-client/pkg/testutils"
6654
)

0 commit comments

Comments
 (0)