Skip to content

Commit 295d97f

Browse files
jtschellingclaude
andcommitted
refactor(janitor): drop drift handling now that node-level locking is in place
The cross-controller node lock prevents two ExtRRs (or any other maintenance CR) from acting on the same node concurrently, so the "foreign taint at our key" scenario is unreachable in production. The ownership check was useful before the lock landed; it's now dead code that bloats the state machine. Removed: * The value-match drift detection in reconcileApply (Released=False terminal failure path). * The drift-safe branch in reconcileCleanup — under the lock, any taint at our key is ours, so unconditionally remove it. * `transitionToReleaseFailure` helper (no callers). * `ReasonReleaseTaintFailed`, `eventReasonReleaseTaintFailed`, `ExtRRResultFailure`, `ExtRROpenStateFailed` constants (no users). * Three envtests covering the drift paths (apply, cleanup, observability). * TestExtRRForeignTaintDrift e2e — the scenario can't occur under the lock contract. Updated: * reconcileApply doc — no longer mentions "drift → terminal False"; the only remaining transient failure is "Node not found → requeue". * reconcileCleanup doc — explains the lock invariant that justifies the simplified taint-removal. * ExtRRTotal phase comment — released and external_response are success-only now. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ee1e212 commit 295d97f

4 files changed

Lines changed: 21 additions & 233 deletions

File tree

janitor/pkg/controller/externalremediationrequest_controller.go

Lines changed: 19 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,8 @@ const (
6161
ReleaseTaintKey = "nvsentinel.dgxc.nvidia.com/external-remediation"
6262

6363
ReasonReleaseTaintApplied = "ReleaseTaintApplied"
64-
ReasonReleaseTaintFailed = "ReleaseTaintFailed"
6564

6665
eventReasonReleaseTaintApplied = "ReleaseTaintApplied"
67-
eventReasonReleaseTaintFailed = "ReleaseTaintFailed"
6866
eventReasonReleaseTaintRemoved = "ReleaseTaintRemoved"
6967
eventReasonOperatorDeleteRequest = "OperatorDeleteRequested"
7068

@@ -275,16 +273,14 @@ const nodeMissingRequeue = 30 * time.Second
275273
const nodeLockRequeue = 2 * time.Second
276274

277275
// reconcileApply applies the release taint + managed=false in one PATCH then
278-
// transitions NVSentinelOwnershipReleased. Failure modes per ADR-040: drift
279-
// → terminal False; Node not found → transient requeue; already-applied →
280-
// idempotent fast path. Other API errors propagate and controller-runtime
281-
// requeues with backoff.
276+
// transitions NVSentinelOwnershipReleased=True. Node not found → transient
277+
// requeue; already-applied → idempotent fast path. Other API errors propagate
278+
// and controller-runtime requeues with backoff.
282279
//
283280
// Acquires the cross-controller node-level lock before any node mutation so a
284281
// concurrent RebootNode / TerminateNode / GPUReset can't act on the same node.
285-
// The lock is released either explicitly on drift (terminal failure) and on
286-
// Complete=True cleanup, or implicitly via the lease's ownerReference when the
287-
// ExtRR is deleted.
282+
// The lock is released on Complete=True cleanup (branch 4) and on operator
283+
// delete (branch 2); the lease's ownerReference catches the failure case.
288284
func (r *ExternalRemediationRequestReconciler) reconcileApply(
289285
ctx context.Context, extrrObj *nvsentinelv1.ExternalRemediationRequest,
290286
) (ctrl.Result, error) {
@@ -309,29 +305,16 @@ func (r *ExternalRemediationRequestReconciler) reconcileApply(
309305
return ctrl.Result{RequeueAfter: nodeLockRequeue}, nil
310306
}
311307

312-
if existing := findTaintByKey(node.Spec.Taints, ReleaseTaintKey); existing != nil {
313-
if existing.Value != extrrObj.Name {
314-
msg := fmt.Sprintf(
315-
"node %q already tainted by ExternalRemediationRequest %q; another ExtRR owns this node",
316-
nodeName, existing.Value)
317-
slog.WarnContext(ctx, "release taint drift detected",
318-
"extrr", extrrObj.Name, "node", nodeName, "existing_owner", existing.Value)
319-
// Release the lock — drift is terminal and we never owned the taint.
320-
r.NodeLock.CheckUnlock(ctx, extrrObj, nodeName)
321-
322-
return ctrl.Result{}, r.transitionToReleaseFailure(ctx, extrrObj, msg)
323-
}
324-
// Idempotent fast path — no PATCH needed if both taint and label match.
325-
if node.Labels[managed.ManagedLabelKey] == managed.ManagedLabelValueFalse {
326-
slog.InfoContext(ctx, "release taint and managed=false label already in place; transitioning condition",
327-
"extrr", extrrObj.Name, "node", nodeName)
308+
// We hold the lock, so any taint at our key is ours.
309+
if findTaintByKey(node.Spec.Taints, ReleaseTaintKey) != nil &&
310+
node.Labels[managed.ManagedLabelKey] == managed.ManagedLabelValueFalse {
311+
slog.InfoContext(ctx, "release taint and managed=false label already in place; transitioning condition",
312+
"extrr", extrrObj.Name, "node", nodeName)
328313

329-
msg := fmt.Sprintf("release taint %s=%s and managed=false label already present on node %q",
330-
ReleaseTaintKey, extrrObj.Name, nodeName)
314+
msg := fmt.Sprintf("release taint %s=%s and managed=false label already present on node %q",
315+
ReleaseTaintKey, extrrObj.Name, nodeName)
331316

332-
return ctrl.Result{}, r.transitionToReleaseSuccess(ctx, extrrObj, msg)
333-
}
334-
// Taint matches but label is missing — fall through to patch the label.
317+
return ctrl.Result{}, r.transitionToReleaseSuccess(ctx, extrrObj, msg)
335318
}
336319

337320
nodeToUpdate := node.DeepCopy()
@@ -385,26 +368,6 @@ func (r *ExternalRemediationRequestReconciler) transitionToReleaseSuccess(
385368
return nil
386369
}
387370

388-
// transitionToReleaseFailure records terminal drift failure. Skips ExtRROpen
389-
// — these ExtRRs aren't in-flight, just counted under released{failure}.
390-
func (r *ExternalRemediationRequestReconciler) transitionToReleaseFailure(
391-
ctx context.Context, extrrObj *nvsentinelv1.ExternalRemediationRequest, message string,
392-
) error {
393-
changed, err := r.transitionReleased(ctx, extrrObj, metav1.ConditionFalse, ReasonReleaseTaintFailed, message)
394-
if err != nil {
395-
return err
396-
}
397-
398-
if !changed {
399-
return nil
400-
}
401-
402-
metrics.GlobalMetrics.IncExtRRTotal(metrics.ExtRRPhaseReleased, metrics.ExtRRResultFailure)
403-
r.emitEvent(extrrObj, corev1.EventTypeWarning, eventReasonReleaseTaintFailed, message)
404-
405-
return nil
406-
}
407-
408371
// reconcileCleanupAfterComplete scrubs the Node when Complete=True; the ExtRR
409372
// stays as a historical record until an operator deletes it.
410373
func (r *ExternalRemediationRequestReconciler) reconcileCleanupAfterComplete(
@@ -491,11 +454,9 @@ func (r *ExternalRemediationRequestReconciler) recordClose(
491454
fmt.Sprintf("release taint and managed=false label removed (%s)", closeReason))
492455
}
493456

494-
// reconcileCleanup removes the release taint only when its value matches this
495-
// ExtRR (drift-safe — a foreign taint is left alone) and deletes the managed
496-
// label. Returns true when the PATCH mutated the Node.
497-
//
498-
//nolint:cyclop // dispatch over drift cases; inline switch reads cleaner.
457+
// reconcileCleanup removes the release taint and the managed label from the
458+
// Node. Returns true when the PATCH mutated the Node. The node-level lock
459+
// guarantees we own the taint at our key, so no value-match check is needed.
499460
func (r *ExternalRemediationRequestReconciler) reconcileCleanup(
500461
ctx context.Context, extrrObj *nvsentinelv1.ExternalRemediationRequest,
501462
) (bool, error) {
@@ -516,16 +477,9 @@ func (r *ExternalRemediationRequestReconciler) reconcileCleanup(
516477
nodeToUpdate := node.DeepCopy()
517478
changed := false
518479

519-
if existing := findTaintByKey(nodeToUpdate.Spec.Taints, ReleaseTaintKey); existing != nil {
520-
switch existing.Value {
521-
case extrrObj.Name:
522-
nodeToUpdate.Spec.Taints = removeTaintByKey(nodeToUpdate.Spec.Taints, ReleaseTaintKey)
523-
changed = true
524-
default:
525-
// Drift: foreign ExtRR owns the taint; its own cleanup will remove it.
526-
slog.WarnContext(ctx, "release taint owned by a different ExtRR; leaving in place during cleanup",
527-
"extrr", extrrObj.Name, "node", nodeName, "existing_owner", existing.Value)
528-
}
480+
if findTaintByKey(nodeToUpdate.Spec.Taints, ReleaseTaintKey) != nil {
481+
nodeToUpdate.Spec.Taints = removeTaintByKey(nodeToUpdate.Spec.Taints, ReleaseTaintKey)
482+
changed = true
529483
}
530484

531485
if _, ok := nodeToUpdate.Labels[managed.ManagedLabelKey]; ok {

janitor/pkg/controller/externalremediationrequest_controller_test.go

Lines changed: 0 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -395,41 +395,6 @@ var _ = Describe("ExternalRemediationRequest Controller resolution paths (branch
395395
"Node ResourceVersion must not advance after the cleanup PATCH settles")
396396
})
397397

398-
It("leaves a foreign taint in place when another ExtRR claims the node (drift on cleanup)", func() {
399-
nodeName := "node-true-drift-1"
400-
key := prepareReleased(ctx, r, "true-drift-extrr-1", nodeName)
401-
DeferCleanup(forceFinalizerRemovalByKey, ctx, r, key)
402-
DeferCleanup(deleteNodeForCleanup, ctx, r, nodeName)
403-
404-
// Rewrite the taint value to simulate a foreign owner.
405-
var node corev1.Node
406-
Expect(r.Client.Get(ctx, ctrlclient.ObjectKey{Name: nodeName}, &node)).To(Succeed())
407-
408-
origNode := node.DeepCopy()
409-
for i := range node.Spec.Taints {
410-
if node.Spec.Taints[i].Key == ReleaseTaintKey {
411-
node.Spec.Taints[i].Value = "foreign-err"
412-
break
413-
}
414-
}
415-
416-
Expect(r.Client.Patch(ctx, &node, ctrlclient.StrategicMergeFrom(origNode))).To(Succeed())
417-
418-
setExternalRemediationComplete(ctx, r.Client,
419-
&nvsentinelv1.ExternalRemediationRequest{ObjectMeta: metav1.ObjectMeta{
420-
Name: key.Name, Namespace: key.Namespace,
421-
}}, "True", "ExternalRemediationSucceeded")
422-
423-
_, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: key})
424-
Expect(err).NotTo(HaveOccurred())
425-
426-
Expect(r.Client.Get(ctx, ctrlclient.ObjectKey{Name: nodeName}, &node)).To(Succeed())
427-
taint := findTaintByKey(node.Spec.Taints, ReleaseTaintKey)
428-
Expect(taint).NotTo(BeNil(), "foreign taint must NOT be removed by our cleanup")
429-
Expect(taint.Value).To(Equal("foreign-err"))
430-
Expect(node.Labels).NotTo(HaveKey(managed.ManagedLabelKey),
431-
"label removal is unconditional (cluster-wide semantics)")
432-
})
433398
})
434399

435400
Context("branch 2: deletionTimestamp set (operator-driven release)", func() {
@@ -792,36 +757,6 @@ var _ = Describe("ExternalRemediationRequest Controller apply path (branch 3)",
792757

793758
// Empty-nodeName rejection is covered by the webhook test suite.
794759

795-
It("transitions to False when the Node is already tainted by a different ExtRR (drift)", func() {
796-
nodeName := "node-drift-1"
797-
// Pre-apply a taint with a DIFFERENT ExtRR's name.
798-
Expect(r.Client.Create(ctx, newTestNode(nodeName, nil,
799-
[]corev1.Taint{{Key: ReleaseTaintKey, Value: "some-other-err", Effect: corev1.TaintEffectNoSchedule}}))).
800-
To(Succeed())
801-
DeferCleanup(deleteNodeForCleanup, ctx, r, nodeName)
802-
803-
extrrObj := newTestExtRR("drift-extrr-1", nodeName)
804-
Expect(r.Client.Create(ctx, extrrObj)).To(Succeed())
805-
DeferCleanup(deleteExtRRForCleanup, ctx, r, extrrObj)
806-
807-
key := ctrlclient.ObjectKey{Name: extrrObj.Name, Namespace: extrrObj.Namespace}
808-
got := reconcileToSteadyState(ctx, r, key, 3)
809-
810-
released := findExtRRCondition(got, ConditionNVSentinelOwnershipReleased)
811-
Expect(released.Status).To(Equal("False"), "drift must transition condition to False")
812-
Expect(released.Reason).To(Equal(ReasonReleaseTaintFailed))
813-
Expect(released.Message).To(ContainSubstring("some-other-err"),
814-
"drift message must identify the existing taint owner")
815-
Expect(released.Message).To(ContainSubstring(nodeName))
816-
817-
var node corev1.Node
818-
Expect(r.Client.Get(ctx, ctrlclient.ObjectKey{Name: nodeName}, &node)).To(Succeed())
819-
taint := findTaintByKey(node.Spec.Taints, ReleaseTaintKey)
820-
Expect(taint).NotTo(BeNil())
821-
Expect(taint.Value).To(Equal("some-other-err"), "drift case must NOT overwrite the existing taint")
822-
Expect(node.Labels).NotTo(HaveKey(managed.ManagedLabelKey), "drift case must NOT set managed=false")
823-
})
824-
825760
It("requeues without acting when another maintenance resource holds the node lock", func() {
826761
nodeName := "node-locked-1"
827762
Expect(r.Client.Create(ctx, newTestNode(nodeName, nil, nil))).To(Succeed())
@@ -1087,32 +1022,6 @@ var _ = Describe("ExternalRemediationRequest Controller observability", func() {
10871022
Expect(events).To(ContainElement(ContainSubstring(eventReasonReleaseTaintApplied)))
10881023
})
10891024

1090-
It("increments released{failure} + emits ReleaseTaintFailed on drift", func() {
1091-
nodeName := "node-obs-drift-1"
1092-
// Pre-existing taint owned by a foreign ExtRR.
1093-
Expect(r.Client.Create(ctx, newTestNode(nodeName, nil,
1094-
[]corev1.Taint{{Key: ReleaseTaintKey, Value: "foreign-owner", Effect: corev1.TaintEffectNoSchedule}}))).
1095-
To(Succeed())
1096-
DeferCleanup(deleteNodeForCleanup, ctx, r, nodeName)
1097-
1098-
failureBefore := testutil.ToFloat64(janitormetrics.ExtRRTotal.WithLabelValues(
1099-
janitormetrics.ExtRRPhaseReleased, janitormetrics.ExtRRResultFailure))
1100-
1101-
extrrObj := newTestExtRR("obs-drift-1", nodeName)
1102-
Expect(r.Client.Create(ctx, extrrObj)).To(Succeed())
1103-
DeferCleanup(forceFinalizerRemoval, ctx, r, extrrObj)
1104-
1105-
key := ctrlclient.ObjectKey{Name: extrrObj.Name, Namespace: extrrObj.Namespace}
1106-
reconcileToSteadyState(ctx, r, key, 3)
1107-
1108-
Expect(testutil.ToFloat64(janitormetrics.ExtRRTotal.WithLabelValues(
1109-
janitormetrics.ExtRRPhaseReleased, janitormetrics.ExtRRResultFailure)) - failureBefore).
1110-
To(BeNumerically("==", 1.0))
1111-
1112-
events := drainEvents(r)
1113-
Expect(events).To(ContainElement(ContainSubstring(eventReasonReleaseTaintFailed)))
1114-
})
1115-
11161025
It("increments closed{success} + external_response{success} + observes age on True cleanup", func() {
11171026
nodeName := "node-obs-close-success-1"
11181027
key := prepareReleased(ctx, r, "obs-close-success-1", nodeName)

janitor/pkg/metrics/extrr_metrics.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,22 +30,20 @@ const (
3030

3131
const (
3232
ExtRRResultSuccess = "success"
33-
ExtRRResultFailure = "failure"
3433
ExtRRResultOperatorDeleted = "operator_deleted"
3534
// ExtRRResultNone is the explicit sentinel for phases without an outcome.
3635
ExtRRResultNone = ""
3736
)
3837

3938
const (
4039
ExtRROpenStateAwaiting = "awaiting"
41-
ExtRROpenStateFailed = "failed"
4240
)
4341

4442
var (
4543
// ExtRRTotal phases:
4644
// created — fresh ExtRR initialised.
47-
// released — NVSentinelOwnershipReleased flipped. result=success|failure.
48-
// external_response — ExternalRemediationComplete observed. result=success|failure.
45+
// released — NVSentinelOwnershipReleased=True. result=success.
46+
// external_response — ExternalRemediationComplete=True observed. result=success.
4947
// closed — cleanup ran. result=success | operator_deleted.
5048
ExtRRTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
5149
Name: "nvsentinel_external_remediation_total",

tests/external_remediation_test.go

Lines changed: 0 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -363,79 +363,6 @@ func TestExtRROperatorDeleteEscape(t *testing.T) {
363363
testEnv.Test(t, feature.Feature())
364364
}
365365

366-
// TestExtRRForeignTaintDrift: a fresh ExtRR for a Node already tainted by
367-
// another ExtRR transitions to Released=False and leaves the foreign taint.
368-
func TestExtRRForeignTaintDrift(t *testing.T) {
369-
feature := features.New("TestExtRRForeignTaintDrift").
370-
WithLabel("suite", "drift").
371-
WithLabel("component", "janitor")
372-
373-
var (
374-
nodeName string
375-
foreignOwnerCR = "extrr-foreign-owner"
376-
freshCR = "extrr-drift-victim"
377-
)
378-
379-
feature.Setup(func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
380-
client, err := c.NewClient()
381-
require.NoError(t, err)
382-
383-
nodeName, err = helpers.GetRealNodeName(ctx, client)
384-
require.NoError(t, err)
385-
386-
// Land the foreign-owner taint first.
387-
_, err = helpers.CreateExtRRCR(ctx, client, foreignOwnerCR, nodeName, "foreign")
388-
require.NoError(t, err)
389-
helpers.WaitForExtRRCondition(ctx, t, client, foreignOwnerCR,
390-
"NVSentinelOwnershipReleased", "True")
391-
392-
return ctx
393-
})
394-
395-
feature.Assess("fresh ExtRR for an already-tainted node transitions to False",
396-
func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
397-
client, err := c.NewClient()
398-
require.NoError(t, err)
399-
400-
_, err = helpers.CreateExtRRCR(ctx, client, freshCR, nodeName, "victim")
401-
require.NoError(t, err)
402-
403-
got := helpers.WaitForExtRRCondition(ctx, t, client, freshCR,
404-
"NVSentinelOwnershipReleased", "False")
405-
require.NotNil(t, got)
406-
407-
cond := helpers.GetCRCondition(got, "NVSentinelOwnershipReleased")
408-
require.NotNil(t, cond)
409-
assert.Equal(t, "ReleaseTaintFailed", cond["reason"],
410-
"drift case must surface ReleaseTaintFailed reason")
411-
412-
node, err := helpers.GetNodeByName(ctx, client, nodeName)
413-
require.NoError(t, err)
414-
assertNodeHasReleaseTaint(t, node, foreignOwnerCR)
415-
416-
return ctx
417-
})
418-
419-
feature.Teardown(func(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
420-
client, err := c.NewClient()
421-
if err != nil {
422-
return ctx
423-
}
424-
425-
_ = helpers.DeleteAllCRs(ctx, t, client, helpers.ExternalRemediationRequestGVK)
426-
// Belt-and-suspenders in case the finalizer-driven cleanup didn't complete.
427-
if nodeName != "" {
428-
if err := helpers.ScrubExtRRStateFromNode(ctx, client, nodeName); err != nil {
429-
t.Logf("ScrubExtRRStateFromNode(%s): %v", nodeName, err)
430-
}
431-
}
432-
433-
return ctx
434-
})
435-
436-
testEnv.Test(t, feature.Feature())
437-
}
438-
439366
func assertNodeHasReleaseTaint(t *testing.T, node *corev1.Node, expectedOwner string) {
440367
t.Helper()
441368

0 commit comments

Comments
 (0)