Skip to content

Commit 1b3ff7f

Browse files
author
Igor Velichkovich
committed
fix(remediation): retry on errors and throw errors to trigger retries
Signed-off-by: Igor Velichkovich <[email protected]>
1 parent 4f3d5b1 commit 1b3ff7f

27 files changed

+2437
-657
lines changed

.gitignore

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -140,30 +140,7 @@ dist/
140140
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
141141
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
142142
# User-specific stuff
143-
.idea/**/workspace.xml
144-
.idea/**/tasks.xml
145-
.idea/**/usage.statistics.xml
146-
.idea/**/dictionaries
147-
.idea/**/shelf
148-
149-
# AWS User-specific
150-
.idea/**/aws.xml
151-
152-
# Generated files
153-
.idea/**/contentModel.xml
154-
155-
# Sensitive or high-churn files
156-
.idea/**/dataSources/
157-
.idea/**/dataSources.ids
158-
.idea/**/dataSources.local.xml
159-
.idea/**/sqlDataSources.xml
160-
.idea/**/dynamic.xml
161-
.idea/**/uiDesigner.xml
162-
.idea/**/dbnavigator.xml
163-
164-
# Gradle
165-
.idea/**/gradle.xml
166-
.idea/**/libraries
143+
.idea/
167144

168145
# CMake
169146
cmake-build-*/

commons/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ require (
2323
github.com/cespare/xxhash/v2 v2.3.0 // indirect
2424
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
2525
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
26+
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
2627
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
2728
github.com/go-logr/logr v1.4.3 // indirect
2829
github.com/go-openapi/jsonpointer v0.22.3 // indirect

commons/go.sum

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
1313
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
1414
github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes=
1515
github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
16+
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
17+
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
1618
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
1719
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
1820
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
@@ -21,6 +23,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
2123
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
2224
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
2325
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
26+
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
27+
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
2428
github.com/go-openapi/jsonpointer v0.22.3 h1:dKMwfV4fmt6Ah90zloTbUKWMD+0he+12XYAsPotrkn8=
2529
github.com/go-openapi/jsonpointer v0.22.3/go.mod h1:0lBbqeRsQ5lIanv3LHZBrmRGHLHcQoOXQnf88fHlGWo=
2630
github.com/go-openapi/jsonreference v0.21.3 h1:96Dn+MRPa0nYAR8DR1E03SblB5FJvh7W6krPI0Z7qMc=
@@ -57,6 +61,8 @@ github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6
5761
github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54=
5862
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
5963
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
64+
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
65+
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
6066
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
6167
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
6268
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
@@ -143,6 +149,10 @@ go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJr
143149
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
144150
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
145151
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
152+
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
153+
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
154+
go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc=
155+
go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
146156
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
147157
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
148158
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
@@ -188,6 +198,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
188198
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
189199
k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY=
190200
k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA=
201+
k8s.io/apiextensions-apiserver v0.34.3 h1:p10fGlkDY09eWKOTeUSioxwLukJnm+KuDZdrW71y40g=
202+
k8s.io/apiextensions-apiserver v0.34.3/go.mod h1:aujxvqGFRdb/cmXYfcRTeppN7S2XV/t7WMEc64zB5A0=
191203
k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8=
192204
k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns=
193205
k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE=

commons/pkg/statemanager/statemanager.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,10 @@ package statemanager
144144
import (
145145
"context"
146146
"fmt"
147+
corev1 "k8s.io/api/core/v1"
148+
"k8s.io/apimachinery/pkg/types"
147149
"log/slog"
150+
"sigs.k8s.io/controller-runtime/pkg/client"
148151

149152
"k8s.io/apimachinery/pkg/api/errors"
150153
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -292,6 +295,98 @@ func (manager *stateManager) UpdateNVSentinelStateNodeLabel(ctx context.Context,
292295
return nodeModified, err
293296
}
294297

298+
type ctrlRuntimeStateManager struct {
299+
client client.Client
300+
}
301+
302+
func NewCtrlRuntimeStateManager(client client.Client) StateManager {
303+
return &ctrlRuntimeStateManager{
304+
client: client,
305+
}
306+
}
307+
308+
// UpdateNVSentinelStateNodeLabel will update the given node to the given value for the dgxc.nvidia.com/nvsentinel-state
309+
// label or it will remove the given label if removeStateLabel is true.
310+
func (manager *ctrlRuntimeStateManager) UpdateNVSentinelStateNodeLabel(
311+
ctx context.Context,
312+
nodeName string,
313+
newStateLabelValue NVSentinelStateLabelValue,
314+
removeStateLabel bool,
315+
) (bool, error) {
316+
node := &corev1.Node{}
317+
err := manager.client.Get(ctx, types.NamespacedName{
318+
Name: nodeName,
319+
}, node)
320+
if err != nil {
321+
return false, err
322+
}
323+
324+
currentValue, exists := node.Labels[NVSentinelStateLabelKey]
325+
326+
if removeStateLabel {
327+
if !exists {
328+
slog.Info("Label already absent",
329+
"node", nodeName,
330+
"label", NVSentinelStateLabelKey)
331+
332+
return false, nil
333+
}
334+
335+
delete(node.Labels, NVSentinelStateLabelKey)
336+
337+
err = manager.client.Update(ctx, node)
338+
if err != nil {
339+
return false, fmt.Errorf("failed to update node %s to remove label: %w", nodeName, err)
340+
}
341+
342+
slog.Info("Label removed successfully for node",
343+
"label", NVSentinelStateLabelKey,
344+
"node", nodeName)
345+
346+
return true, nil
347+
}
348+
349+
slog.Info("Labeling node", "node", nodeName, "from", currentValue, "to", newStateLabelValue)
350+
351+
if exists && currentValue == string(newStateLabelValue) {
352+
slog.Info("No update needed for node", "node", nodeName, "label", NVSentinelStateLabelKey,
353+
"value", newStateLabelValue)
354+
355+
return false, nil
356+
}
357+
358+
// Check for unexpected state transitions (for observability)
359+
// We'll return the error AFTER updating the label, so callers can emit error metrics
360+
// while still having the label reflect what modules are actually doing
361+
validationErr := validateStateTransition(nodeName, currentValue, exists, newStateLabelValue)
362+
if validationErr != nil {
363+
slog.Warn("Invalid state transition", "node", nodeName,
364+
"from", currentValue, "to", newStateLabelValue, "error", validationErr)
365+
}
366+
if node.Labels == nil {
367+
node.Labels = map[string]string{}
368+
}
369+
node.Labels[NVSentinelStateLabelKey] = string(newStateLabelValue)
370+
371+
// Update the node (this happens regardless of validation result)
372+
err = manager.client.Update(ctx, node)
373+
if err != nil {
374+
return false, fmt.Errorf("failed to update node %s with new label: %w", nodeName, err)
375+
}
376+
377+
slog.Info("Label updated successfully for node",
378+
"label", NVSentinelStateLabelKey,
379+
"node", nodeName)
380+
381+
// Return validation error AFTER successful label update
382+
// This allows callers to emit error metrics while the label reflects reality
383+
if validationErr != nil {
384+
return true, validationErr
385+
}
386+
387+
return true, err
388+
}
389+
295390
// validateStateTransition detects unexpected state transitions for observability.
296391
// Returns an error for unexpected transitions, but the caller updates the label anyway.
297392
// This allows callers to emit error metrics while still reflecting what modules are actually doing.

0 commit comments

Comments
 (0)