@@ -144,7 +144,10 @@ package statemanager
144144import (
145145 "context"
146146 "fmt"
147+ corev1 "k8s.io/api/core/v1"
148+ "k8s.io/apimachinery/pkg/types"
147149 "log/slog"
150+ "sigs.k8s.io/controller-runtime/pkg/client"
148151
149152 "k8s.io/apimachinery/pkg/api/errors"
150153 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -292,6 +295,98 @@ func (manager *stateManager) UpdateNVSentinelStateNodeLabel(ctx context.Context,
292295 return nodeModified , err
293296}
294297
298+ type ctrlRuntimeStateManager struct {
299+ client client.Client
300+ }
301+
302+ func NewCtrlRuntimeStateManager (client client.Client ) StateManager {
303+ return & ctrlRuntimeStateManager {
304+ client : client ,
305+ }
306+ }
307+
308+ // UpdateNVSentinelStateNodeLabel will update the given node to the given value for the dgxc.nvidia.com/nvsentinel-state
309+ // label or it will remove the given label if removeStateLabel is true.
310+ func (manager * ctrlRuntimeStateManager ) UpdateNVSentinelStateNodeLabel (
311+ ctx context.Context ,
312+ nodeName string ,
313+ newStateLabelValue NVSentinelStateLabelValue ,
314+ removeStateLabel bool ,
315+ ) (bool , error ) {
316+ node := & corev1.Node {}
317+ err := manager .client .Get (ctx , types.NamespacedName {
318+ Name : nodeName ,
319+ }, node )
320+ if err != nil {
321+ return false , err
322+ }
323+
324+ currentValue , exists := node .Labels [NVSentinelStateLabelKey ]
325+
326+ if removeStateLabel {
327+ if ! exists {
328+ slog .Info ("Label already absent" ,
329+ "node" , nodeName ,
330+ "label" , NVSentinelStateLabelKey )
331+
332+ return false , nil
333+ }
334+
335+ delete (node .Labels , NVSentinelStateLabelKey )
336+
337+ err = manager .client .Update (ctx , node )
338+ if err != nil {
339+ return false , fmt .Errorf ("failed to update node %s to remove label: %w" , nodeName , err )
340+ }
341+
342+ slog .Info ("Label removed successfully for node" ,
343+ "label" , NVSentinelStateLabelKey ,
344+ "node" , nodeName )
345+
346+ return true , nil
347+ }
348+
349+ slog .Info ("Labeling node" , "node" , nodeName , "from" , currentValue , "to" , newStateLabelValue )
350+
351+ if exists && currentValue == string (newStateLabelValue ) {
352+ slog .Info ("No update needed for node" , "node" , nodeName , "label" , NVSentinelStateLabelKey ,
353+ "value" , newStateLabelValue )
354+
355+ return false , nil
356+ }
357+
358+ // Check for unexpected state transitions (for observability)
359+ // We'll return the error AFTER updating the label, so callers can emit error metrics
360+ // while still having the label reflect what modules are actually doing
361+ validationErr := validateStateTransition (nodeName , currentValue , exists , newStateLabelValue )
362+ if validationErr != nil {
363+ slog .Warn ("Invalid state transition" , "node" , nodeName ,
364+ "from" , currentValue , "to" , newStateLabelValue , "error" , validationErr )
365+ }
366+ if node .Labels == nil {
367+ node .Labels = map [string ]string {}
368+ }
369+ node .Labels [NVSentinelStateLabelKey ] = string (newStateLabelValue )
370+
371+ // Update the node (this happens regardless of validation result)
372+ err = manager .client .Update (ctx , node )
373+ if err != nil {
374+ return false , fmt .Errorf ("failed to update node %s with new label: %w" , nodeName , err )
375+ }
376+
377+ slog .Info ("Label updated successfully for node" ,
378+ "label" , NVSentinelStateLabelKey ,
379+ "node" , nodeName )
380+
381+ // Return validation error AFTER successful label update
382+ // This allows callers to emit error metrics while the label reflects reality
383+ if validationErr != nil {
384+ return true , validationErr
385+ }
386+
387+ return true , err
388+ }
389+
295390// validateStateTransition detects unexpected state transitions for observability.
296391// Returns an error for unexpected transitions, but the caller updates the label anyway.
297392// This allows callers to emit error metrics while still reflecting what modules are actually doing.
0 commit comments