Skip to content

Commit ee1e212

Browse files
jtschellingclaude
andcommitted
feat(janitor): onboard ExtRR to node-level locking
ExtRR now participates in the cross-controller node-level lock shared by RebootNode / TerminateNode / GPUReset. Without this, fault-remediation could create an ExtRR for a node already being acted on by another maintenance resource (or vice-versa), producing overlapping handoffs. Behavior: * reconcileApply acquires the lock after fetching the Node, before any Node mutation. Contention requeues with the 2s cadence the siblings use. The drift branch explicitly releases the lock (terminal failure; we never owned the taint). * reconcileCleanupAfterComplete (branch 4) releases the lock once Complete=True is observed — the external system finished, the Node is back in NVSentinel's hands. CheckUnlock is idempotent so subsequent no-op cleanup reconciles don't churn it. * reconcileCleanupOnDeletion (branch 2) releases the lock explicitly before finalizer removal. K8s GC via the lease's ownerReference covers the failure case, but the explicit unlock matches the sibling pattern and shortens the window where the lease could outlive its owner. Wiring: * New `NodeLock` + `LockNamespace` fields on the reconciler. * main.go passes `LockNamespace: podNamespace` (matches siblings). * SetupWithManager initialises `distributedlock.NewNodeLock(...)` when unset, so tests can inject their own. * New kubebuilder RBAC marker for `coordination.k8s.io/leases`. Adds an envtest spec that pre-creates a lease with a foreign owner and asserts the apply path requeues with `nodeLockRequeue` and the Node stays untouched. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 1319eeb commit ee1e212

3 files changed

Lines changed: 93 additions & 4 deletions

File tree

janitor/main.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,9 @@ func run() error {
240240
}
241241

242242
if err = (&controller.ExternalRemediationRequestReconciler{
243-
Client: mgr.GetClient(),
244-
Scheme: mgr.GetScheme(),
243+
Client: mgr.GetClient(),
244+
Scheme: mgr.GetScheme(),
245+
LockNamespace: podNamespace,
245246
}).SetupWithManager(mgr); err != nil {
246247
slog.Error("unable to create controller", "controller", "ExternalRemediationRequest", "error", err)
247248

janitor/pkg/controller/externalremediationrequest_controller.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
protos "github.com/nvidia/nvsentinel/data-models/pkg/protos"
4343
nvsentinelv1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1"
4444
"github.com/nvidia/nvsentinel/janitor/pkg/condition"
45+
"github.com/nvidia/nvsentinel/janitor/pkg/distributedlock"
4546
"github.com/nvidia/nvsentinel/janitor/pkg/metrics"
4647
)
4748

@@ -78,8 +79,10 @@ const (
7879
// ExternalRemediationRequestReconciler implements the ADR-040 state machine.
7980
type ExternalRemediationRequestReconciler struct {
8081
client.Client
81-
Scheme *runtime.Scheme
82-
Recorder record.EventRecorder
82+
Scheme *runtime.Scheme
83+
Recorder record.EventRecorder
84+
NodeLock distributedlock.NodeLock
85+
LockNamespace string
8386
}
8487

8588
const labelValueUnknown = "unknown"
@@ -112,6 +115,7 @@ func (r *ExternalRemediationRequestReconciler) emitEvent(
112115
// +kubebuilder:rbac:groups=nvsentinel.dgxc.nvidia.com,resources=externalremediationrequests/status,verbs=get;update;patch
113116
// +kubebuilder:rbac:groups=nvsentinel.dgxc.nvidia.com,resources=externalremediationrequests/finalizers,verbs=update
114117
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;patch
118+
// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;delete
115119

116120
// Reconcile drives the ExtRR through its lifecycle. The OTEL span is linked to
117121
// the upstream health-monitor trace via annotations stamped by fault-remediation.
@@ -267,11 +271,20 @@ func (r *ExternalRemediationRequestReconciler) dispatch(
267271
// nodeMissingRequeue covers the cluster-autoscaler / kubelet-registration race.
268272
const nodeMissingRequeue = 30 * time.Second
269273

274+
// nodeLockRequeue matches the sibling janitor controllers' lock-retry cadence.
275+
const nodeLockRequeue = 2 * time.Second
276+
270277
// reconcileApply applies the release taint + managed=false in one PATCH then
271278
// transitions NVSentinelOwnershipReleased. Failure modes per ADR-040: drift
272279
// → terminal False; Node not found → transient requeue; already-applied →
273280
// idempotent fast path. Other API errors propagate and controller-runtime
274281
// requeues with backoff.
282+
//
283+
// Acquires the cross-controller node-level lock before any node mutation so a
284+
// concurrent RebootNode / TerminateNode / GPUReset can't act on the same node.
285+
// The lock is released either explicitly on drift (terminal failure) and on
286+
// Complete=True cleanup, or implicitly via the lease's ownerReference when the
287+
// ExtRR is deleted.
275288
func (r *ExternalRemediationRequestReconciler) reconcileApply(
276289
ctx context.Context, extrrObj *nvsentinelv1.ExternalRemediationRequest,
277290
) (ctrl.Result, error) {
@@ -289,13 +302,22 @@ func (r *ExternalRemediationRequestReconciler) reconcileApply(
289302
return ctrl.Result{}, fmt.Errorf("get node %q for ExtRR %q: %w", nodeName, extrrObj.Name, err)
290303
}
291304

305+
if !r.NodeLock.LockNode(ctx, extrrObj, nodeName) {
306+
slog.InfoContext(ctx, "another maintenance resource holds the node lock; requeueing",
307+
"extrr", extrrObj.Name, "node", nodeName)
308+
309+
return ctrl.Result{RequeueAfter: nodeLockRequeue}, nil
310+
}
311+
292312
if existing := findTaintByKey(node.Spec.Taints, ReleaseTaintKey); existing != nil {
293313
if existing.Value != extrrObj.Name {
294314
msg := fmt.Sprintf(
295315
"node %q already tainted by ExternalRemediationRequest %q; another ExtRR owns this node",
296316
nodeName, existing.Value)
297317
slog.WarnContext(ctx, "release taint drift detected",
298318
"extrr", extrrObj.Name, "node", nodeName, "existing_owner", existing.Value)
319+
// Release the lock — drift is terminal and we never owned the taint.
320+
r.NodeLock.CheckUnlock(ctx, extrrObj, nodeName)
299321

300322
return ctrl.Result{}, r.transitionToReleaseFailure(ctx, extrrObj, msg)
301323
}
@@ -400,6 +422,10 @@ func (r *ExternalRemediationRequestReconciler) reconcileCleanupAfterComplete(
400422
metrics.GlobalMetrics.IncExtRRTotal(metrics.ExtRRPhaseExternalResponse, metrics.ExtRRResultSuccess)
401423
}
402424

425+
// Release the node-level lock — the external system has finished and the
426+
// node is back in NVSentinel's hands. CheckUnlock is idempotent.
427+
r.NodeLock.CheckUnlock(ctx, extrrObj, extrrObj.Spec.HealthEvent.NodeName)
428+
403429
return ctrl.Result{}, nil
404430
}
405431

@@ -425,6 +451,12 @@ func (r *ExternalRemediationRequestReconciler) reconcileCleanupOnDeletion(
425451
r.recordClose(extrrObj, metrics.ExtRRResultOperatorDeleted, closeReasonOperatorInitiated)
426452
}
427453

454+
// Release the lock before removing the finalizer. K8s GC via the lease's
455+
// ownerReference covers the failure case, but the explicit unlock keeps
456+
// us consistent with the sibling controllers and shortens the window
457+
// where the lease could outlive its owner.
458+
r.NodeLock.CheckUnlock(ctx, extrrObj, extrrObj.Spec.HealthEvent.NodeName)
459+
428460
updated := extrrObj.DeepCopy()
429461
controllerutil.RemoveFinalizer(updated, ExternalRemediationFinalizer)
430462

@@ -635,6 +667,10 @@ func (r *ExternalRemediationRequestReconciler) SetupWithManager(mgr ctrl.Manager
635667
r.Recorder = mgr.GetEventRecorderFor("externalremediationrequest-controller")
636668
}
637669

670+
if r.NodeLock == nil {
671+
r.NodeLock = distributedlock.NewNodeLock(mgr.GetClient(), r.LockNamespace)
672+
}
673+
638674
if err := mgr.GetFieldIndexer().IndexField(
639675
context.Background(),
640676
&nvsentinelv1.ExternalRemediationRequest{},

janitor/pkg/controller/externalremediationrequest_controller_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/stretchr/testify/assert"
2727
"github.com/prometheus/client_golang/prometheus/testutil"
2828
"google.golang.org/protobuf/types/known/timestamppb"
29+
coordinationv1 "k8s.io/api/coordination/v1"
2930
corev1 "k8s.io/api/core/v1"
3031
apierrors "k8s.io/apimachinery/pkg/api/errors"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -39,6 +40,7 @@ import (
3940
"github.com/nvidia/nvsentinel/commons/pkg/managed"
4041
protos "github.com/nvidia/nvsentinel/data-models/pkg/protos"
4142
nvsentinelv1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1"
43+
"github.com/nvidia/nvsentinel/janitor/pkg/distributedlock"
4244
janitormetrics "github.com/nvidia/nvsentinel/janitor/pkg/metrics"
4345
)
4446

@@ -53,6 +55,7 @@ func newExtRRReconciler() *ExternalRemediationRequestReconciler {
5355
Client: c,
5456
Scheme: scheme.Scheme,
5557
Recorder: record.NewFakeRecorder(64),
58+
NodeLock: distributedlock.NewNodeLock(c, testExtRRNamespace),
5659
}
5760
}
5861

@@ -819,6 +822,55 @@ var _ = Describe("ExternalRemediationRequest Controller apply path (branch 3)",
819822
Expect(node.Labels).NotTo(HaveKey(managed.ManagedLabelKey), "drift case must NOT set managed=false")
820823
})
821824

825+
It("requeues without acting when another maintenance resource holds the node lock", func() {
826+
nodeName := "node-locked-1"
827+
Expect(r.Client.Create(ctx, newTestNode(nodeName, nil, nil))).To(Succeed())
828+
DeferCleanup(deleteNodeForCleanup, ctx, r, nodeName)
829+
830+
// Pre-create a lease as if a sibling maintenance CR (e.g. RebootNode)
831+
// already holds the lock for this node.
832+
foreignLease := &coordinationv1.Lease{
833+
ObjectMeta: metav1.ObjectMeta{
834+
Name: nodeName,
835+
Namespace: testExtRRNamespace,
836+
OwnerReferences: []metav1.OwnerReference{{
837+
APIVersion: "janitor.dgxc.nvidia.com/v1alpha1",
838+
Kind: "RebootNode",
839+
Name: "foreign-reboot",
840+
UID: "foreign-uid",
841+
}},
842+
},
843+
}
844+
Expect(r.Client.Create(ctx, foreignLease)).To(Succeed())
845+
DeferCleanup(func() { _ = r.Client.Delete(ctx, foreignLease) })
846+
847+
extrrObj := newTestExtRR("locked-extrr-1", nodeName)
848+
Expect(r.Client.Create(ctx, extrrObj)).To(Succeed())
849+
DeferCleanup(deleteExtRRForCleanup, ctx, r, extrrObj)
850+
851+
key := ctrlclient.ObjectKey{Name: extrrObj.Name, Namespace: extrrObj.Namespace}
852+
// Init completes (no Node interaction); second reconcile hits apply,
853+
// finds the foreign lock, requeues without acting.
854+
_, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: key})
855+
Expect(err).NotTo(HaveOccurred())
856+
857+
result, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: key})
858+
Expect(err).NotTo(HaveOccurred(), "lock contention must not propagate as a reconcile error")
859+
Expect(result.RequeueAfter).To(Equal(nodeLockRequeue), "lock contention must requeue with the lock-retry cadence")
860+
861+
// Released stays Unknown — apply never ran.
862+
var got nvsentinelv1.ExternalRemediationRequest
863+
Expect(r.Client.Get(ctx, key, &got)).To(Succeed())
864+
released := findExtRRCondition(&got, ConditionNVSentinelOwnershipReleased)
865+
Expect(released.Status).To(Equal("Unknown"))
866+
867+
// And the Node is untouched.
868+
var node corev1.Node
869+
Expect(r.Client.Get(ctx, ctrlclient.ObjectKey{Name: nodeName}, &node)).To(Succeed())
870+
Expect(findTaintByKey(node.Spec.Taints, ReleaseTaintKey)).To(BeNil())
871+
Expect(node.Labels).NotTo(HaveKey(managed.ManagedLabelKey))
872+
})
873+
822874
})
823875

824876
func TestExtRRReconciler_NeedsInitialization(t *testing.T) {

0 commit comments

Comments
 (0)