Skip to content

Commit 51bdbaa

Browse files
jtschellingclaude
andcommitted
refactor(janitor): drop the Node watch + field indexer + predicate
The Node watch existed to self-heal during apply (if a Node update landed mid-apply we'd re-enqueue and try again) and to detect drift on released nodes. Both reasons are gone: * The apply path is one-shot under the node-level lock — we either succeed and transition Released=True, or the API call propagates an error and controller-runtime requeues. There's no "we needed another watch fire to make progress" case. * Post-release, the external system owns the Node's release state. ADR-040 is explicit that we do NOT re-take ownership if the external system strips the taint, and we no longer treat drift as terminal, so re-enqueueing on Node updates is a no-op. The watch was firing on every Node taint/label change just to land in the dispatcher's no-op branch. Net negative: pointless reconcile churn, 40 lines of mapper/predicate/indexer plumbing, an extra RBAC scope implication. Removed: * `Watches(&corev1.Node{}, ...)` from SetupWithManager * `mapNodeToExtRRs`, `nodeReleaseStateChangedPredicate`, `indexExtRRByNodeName`, `extrrNodeNameIndexKey` * The field-indexer registration + `mgr.GetFieldIndexer()` call * `externalremediationrequest_setup_test.go` (the dedicated test file for the indexer/predicate/mapper) * Unused imports: `builder`, `handler`, `predicate`, `event` Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 295d97f commit 51bdbaa

2 files changed

Lines changed: 0 additions & 300 deletions

File tree

janitor/pkg/controller/externalremediationrequest_controller.go

Lines changed: 0 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,8 @@ import (
2929
"k8s.io/apimachinery/pkg/runtime"
3030
"k8s.io/client-go/tools/record"
3131
ctrl "sigs.k8s.io/controller-runtime"
32-
"sigs.k8s.io/controller-runtime/pkg/builder"
3332
"sigs.k8s.io/controller-runtime/pkg/client"
3433
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
35-
"sigs.k8s.io/controller-runtime/pkg/event"
36-
"sigs.k8s.io/controller-runtime/pkg/handler"
37-
"sigs.k8s.io/controller-runtime/pkg/predicate"
3834

3935
"github.com/nvidia/nvsentinel/commons/pkg/managed"
4036
"github.com/nvidia/nvsentinel/commons/pkg/tracing"
@@ -70,8 +66,6 @@ const (
7066
// in the ReleaseTaintRemoved event message.
7167
closeReasonExternalRemediationCompleteTrue = "ExternalRemediationCompleteTrue"
7268
closeReasonOperatorInitiated = "OperatorInitiated"
73-
74-
extrrNodeNameIndexKey = "spec.healthEvent.nodeName"
7569
)
7670

7771
// ExternalRemediationRequestReconciler implements the ADR-040 state machine.
@@ -613,8 +607,6 @@ func (r *ExternalRemediationRequestReconciler) patchStatusConditions(
613607
return true, nil
614608
}
615609

616-
// SetupWithManager wires the controller, the Node watch that re-enqueues on
617-
// taint/label drift, and the field indexer on spec.healthEvent.nodeName.
618610
func (r *ExternalRemediationRequestReconciler) SetupWithManager(mgr ctrl.Manager) error {
619611
if r.Recorder == nil {
620612
// nolint:staticcheck // SA1019: project-wide events.k8s.io migration pending.
@@ -625,101 +617,8 @@ func (r *ExternalRemediationRequestReconciler) SetupWithManager(mgr ctrl.Manager
625617
r.NodeLock = distributedlock.NewNodeLock(mgr.GetClient(), r.LockNamespace)
626618
}
627619

628-
if err := mgr.GetFieldIndexer().IndexField(
629-
context.Background(),
630-
&nvsentinelv1.ExternalRemediationRequest{},
631-
extrrNodeNameIndexKey,
632-
indexExtRRByNodeName,
633-
); err != nil {
634-
return fmt.Errorf("registering ExtRR node-name field indexer: %w", err)
635-
}
636-
637620
return ctrl.NewControllerManagedBy(mgr).
638621
For(&nvsentinelv1.ExternalRemediationRequest{}).
639-
Watches(
640-
&corev1.Node{},
641-
handler.EnqueueRequestsFromMapFunc(r.mapNodeToExtRRs),
642-
builder.WithPredicates(nodeReleaseStateChangedPredicate()),
643-
).
644622
Named("externalremediationrequest").
645623
Complete(r)
646624
}
647-
648-
// indexExtRRByNodeName returns nil for malformed ExtRRs so a webhook bypass
649-
// can't crash the indexer.
650-
func indexExtRRByNodeName(o client.Object) []string {
651-
extrr, ok := o.(*nvsentinelv1.ExternalRemediationRequest)
652-
if !ok || extrr.Spec == nil || extrr.Spec.HealthEvent == nil || extrr.Spec.HealthEvent.NodeName == "" {
653-
return nil
654-
}
655-
656-
return []string{extrr.Spec.HealthEvent.NodeName}
657-
}
658-
659-
// nodeReleaseStateChangedPredicate drops Node updates outside the release
660-
// surface (managed label + release taint), so kubelet heartbeats don't
661-
// re-enqueue every ExtRR on every sync.
662-
func nodeReleaseStateChangedPredicate() predicate.Predicate {
663-
return predicate.Funcs{
664-
CreateFunc: func(_ event.CreateEvent) bool { return true },
665-
DeleteFunc: func(_ event.DeleteEvent) bool { return true },
666-
GenericFunc: func(_ event.GenericEvent) bool { return true },
667-
UpdateFunc: func(e event.UpdateEvent) bool {
668-
oldNode, ok := e.ObjectOld.(*corev1.Node)
669-
if !ok {
670-
return false
671-
}
672-
673-
newNode, ok := e.ObjectNew.(*corev1.Node)
674-
if !ok {
675-
return false
676-
}
677-
678-
if oldNode.Labels[managed.ManagedLabelKey] != newNode.Labels[managed.ManagedLabelKey] {
679-
return true
680-
}
681-
682-
oldTaint := findTaintByKey(oldNode.Spec.Taints, ReleaseTaintKey)
683-
newTaint := findTaintByKey(newNode.Spec.Taints, ReleaseTaintKey)
684-
685-
if (oldTaint == nil) != (newTaint == nil) {
686-
return true
687-
}
688-
689-
if oldTaint != nil && newTaint != nil && oldTaint.Value != newTaint.Value {
690-
return true
691-
}
692-
693-
return false
694-
},
695-
}
696-
}
697-
698-
// mapNodeToExtRRs uses the field indexer so each lookup is bounded by the
699-
// ExtRRs targeting this Node, not the total ExtRR population.
700-
func (r *ExternalRemediationRequestReconciler) mapNodeToExtRRs(ctx context.Context, obj client.Object) []ctrl.Request {
701-
node, ok := obj.(*corev1.Node)
702-
if !ok {
703-
return nil
704-
}
705-
706-
var extrrs nvsentinelv1.ExternalRemediationRequestList
707-
if err := r.List(ctx, &extrrs, client.MatchingFields{extrrNodeNameIndexKey: node.Name}); err != nil {
708-
slog.ErrorContext(ctx, "listing ExternalRemediationRequests by node-name index",
709-
"node", node.Name, "error", err)
710-
711-
return nil
712-
}
713-
714-
requests := make([]ctrl.Request, 0, len(extrrs.Items))
715-
for i := range extrrs.Items {
716-
requests = append(requests, ctrl.Request{
717-
NamespacedName: client.ObjectKey{
718-
Name: extrrs.Items[i].Name,
719-
Namespace: extrrs.Items[i].Namespace,
720-
},
721-
})
722-
}
723-
724-
return requests
725-
}

janitor/pkg/controller/externalremediationrequest_setup_test.go

Lines changed: 0 additions & 199 deletions
This file was deleted.

0 commit comments

Comments
 (0)