NVIDIA · ayuskauskas · May 29, 2026
@@ -2,6 +2,22 @@
 
 All notable changes to this project will be documented in this file.
 
+## [Unreleased]
+
+### Bug Fixes
+
+- **Reapply-on-reboot dropped on busy nodes.** With `REAPPLY_ON_REBOOT=true`, a
+  reboot of a node under heavy controller churn (frequent pod/label/annotation
+  updates) could be detected and then silently lost: the per-node state reset
+  was persisted with a full `Update` that lost an optimistic-concurrency race,
+  yet the node's boot id was advanced anyway, marking the reboot handled. The
+  node kept its stale `complete` state and the package was never reapplied
+  (`unknown -> complete`, no pod). The reset now persists via a strategic-merge
+  `Patch` (not resourceVersion-gated, like the rest of the reconcile), and the
+  boot id is advanced only after that write succeeds, so a failed reset leaves
+  the reboot pending to be retried. Also fixes `Reset()` deleting the cordon
+  annotation with a key missing the Skyhook name.
+
 ## [operator/v0.16.1] - 2026-05-22
 
 ### Bug Fixes

@@ -628,18 +628,27 @@ func (r *SkyhookReconciler) TrackReboots(ctx context.Context, clusterState *clus
 					r.recorder.Eventf(skyhook.GetSkyhook().Skyhook, nil, EventTypeNormal, EventsReasonNodeReboot, "ResetNodeState", "detected reboot, resetting node [%s] to be reapplied", node.GetNode().Name)
 					r.recorder.Eventf(node.GetNode(), nil, EventTypeNormal, EventsReasonNodeReboot, "ResetNodeState", "detected reboot, resetting node for [%s] to be reapplied", node.GetSkyhook().Name)
 					node.Reset()
+
+					// Persist the reset before recording the new boot id. We Patch rather than
+					// Update because a busy node's resourceVersion churns constantly under other
+					// controllers, and a full Update would lose that optimistic-concurrency race; a
+					// strategic merge of only our annotation/label changes does not. And we advance
+					// NodeBootIds only after the write is durable: if the reset never lands, leaving
+					// the boot id unchanged keeps the reboot pending so it is re-detected and retried
+					// next reconcile, instead of being silently consumed while the node's stale
+					// "complete" state remains and the package is never reapplied.
+					if node.Changed() {
+						updates = true
+						patch := client.StrategicMergeFrom(clusterState.tracker.GetOriginal(node.GetNode()))
+						if err := r.Patch(ctx, node.GetNode(), patch); err != nil {
+							errs = append(errs, fmt.Errorf("error patching node after reboot [%s]: %w", node.GetNode().Name, err))
+							continue
+						}
+					}
 				}
 				skyhook.GetSkyhook().Status.NodeBootIds[node.GetNode().Name] = node.GetNode().Status.NodeInfo.BootID
 				skyhook.GetSkyhook().Updated = true
 			}
-
-			if node.Changed() { // update
-				updates = true
-				err := r.Update(ctx, node.GetNode())
-				if err != nil {
-					errs = append(errs, fmt.Errorf("error updating node after reboot [%s]: %w", node.GetNode().Name, err))
-				}
-			}
 		}
 		if skyhook.GetSkyhook().Updated { // update
 			updates = true

@@ -3431,3 +3431,124 @@ func TestHandleVersionChange_DowngradeIsNoOp(t *testing.T) {
 		g.Expect(err).To(BeNil())
 	})
 }
+
+// Reproduction for the "reapply-on-reboot loses the reboot on busy nodes" bug.
+//
+// On a node with high churn (many controllers writing labels/annotations/status), the
+// node-state reset that TrackReboots performs on reboot detection is persisted with a full,
+// optimistic-concurrency r.Update(node). That Update loses the resourceVersion race and is
+// rejected, but TrackReboots advances Status.NodeBootIds BEFORE (and independent of) that
+// write, so the reboot is "consumed" and never retried. The node keeps its stale "complete"
+// node-state annotation, and on the next reconcile derives complete -> no reapply pod.
+//
+// This spec drives the real envtest apiserver so the 409 is genuine (not injected). It
+// asserts on the in-memory clusterState object so the running manager (which reconciles real
+// CRs asynchronously) cannot race the assertion — the manager cannot touch our Go object.
+var _ = Describe("TrackReboots reapply-on-reboot on a busy node", func() {
+
+	It("must not consume the reboot when the node-state reset write loses an optimistic-concurrency race", func() {
+		const (
+			skyhookName = "reboot-conflict-sh"
+			nodeName    = "reboot-conflict-node"
+			oldBootID   = "boot-A"
+			newBootID   = "boot-B"
+		)
+		nodeLabel := map[string]string{"reboot-conflict-test": "yes"}
+
+		pkgRef := v1alpha1.PackageRef{Name: "pkg1", Version: "1.0.0"}
+		completeState := v1alpha1.NodeState{}
+		completeState.Upsert(pkgRef, "ghcr.io/org/pkg1", v1alpha1.StateComplete, v1alpha1.StageConfig, 0, "")
+		stateJSON, err := json.Marshal(completeState)
+		Expect(err).ToNot(HaveOccurred())
+		nodeStateKey := fmt.Sprintf("%s/nodeState_%s", v1alpha1.METADATA_PREFIX, skyhookName)
+
+		// Real Node in the apiserver carrying the old "complete" node-state annotation. BootID
+		// lives on the status subresource (not persisted by Create); we only need it on the
+		// in-memory snapshot below for reboot detection.
+		node := &corev1.Node{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:        nodeName,
+				Labels:      nodeLabel,
+				Annotations: map[string]string{nodeStateKey: string(stateJSON)},
+			},
+		}
+		Expect(k8sClient.Create(ctx, node)).To(Succeed())
+		DeferCleanup(func() {
+			_ = k8sClient.Delete(ctx, node)
+		})
+
+		// Persist the post-reboot BootID to the Node status (as the kubelet would). This makes
+		// TrackReboots see a reboot, and lets the reset Patch round-trip preserve the BootID so
+		// the new boot id is recorded correctly.
+		node.Status.NodeInfo.BootID = newBootID
+		Expect(k8sClient.Status().Update(ctx, node)).To(Succeed())
+
+		// The operator's in-memory snapshot, taken at this resourceVersion; the busy-node churn
+		// below advances the live Node past it.
+		snapshotNode := node.DeepCopy()
+
+		// Skyhook is kept in memory only: creating it would let the running manager reconcile
+		// it asynchronously. We assert on the in-memory clusterState, which the manager cannot
+		// reach, so the test stays deterministic.
+		skyhook := v1alpha1.Skyhook{
+			ObjectMeta: metav1.ObjectMeta{Name: skyhookName},
+			Spec: v1alpha1.SkyhookSpec{
+				NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabel},
+				Packages: v1alpha1.Packages{
+					pkgRef.Name: {PackageRef: pkgRef, Image: "ghcr.io/org/pkg1"},
+				},
+			},
+			Status: v1alpha1.SkyhookStatus{
+				NodeBootIds: map[string]string{nodeName: oldBootID},
+			},
+		}
+
+		clusterState, err := BuildState(
+			&v1alpha1.SkyhookList{Items: []v1alpha1.Skyhook{skyhook}},
+			&corev1.NodeList{Items: []corev1.Node{*snapshotNode}},
+			&v1alpha1.DeploymentPolicyList{},
+		)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(clusterState.skyhooks).To(HaveLen(1))
+		Expect(clusterState.skyhooks[0].GetNodes()).To(HaveLen(1), "node must be paired to the skyhook")
+
+		// NOISE: simulate a busy node — a concurrent controller mutates the live Node, bumping
+		// its resourceVersion. A single out-of-band write is enough: TrackReboots persists the
+		// node with a full Update, which conflicts if the live object moved at all. The 409 is
+		// the real apiserver's response, not an injected error, and is deterministic with no
+		// sleeps or polling.
+		churn := &corev1.Node{}
+		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, churn)).To(Succeed())
+		churn.Labels["busy-controller"] = "wrote-here"
+		Expect(k8sClient.Update(ctx, churn)).To(Succeed())
+
+		r := &SkyhookReconciler{
+			Client:   k8sClient,
+			recorder: operator.recorder,
+			opts:     SkyhookOperatorOptions{ReapplyOnReboot: true},
+		}
+
+		_, err = r.TrackReboots(ctx, clusterState)
+
+		// The node-state reset must NOT fail because the live Node moved under concurrent churn.
+		// A merge Patch is not resourceVersion-gated, so it lands where the old full Update lost
+		// a 409. (The skyhook-status write is expected to fail here only because this test keeps
+		// the Skyhook in memory; that is unrelated to the node write under test.)
+		if err != nil {
+			Expect(err.Error()).ToNot(ContainSubstring("node after reboot"),
+				"the node-state reset write must not fail on a busy node")
+		}
+
+		// PRIMARY: the stale "complete" node-state annotation must be cleared on the apiserver,
+		// so the package will be reapplied. On the buggy full-Update path the write is rejected
+		// and this annotation survives, so this assertion fails.
+		live := &corev1.Node{}
+		Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, live)).To(Succeed())
+		Expect(live.Annotations).ToNot(HaveKey(nodeStateKey),
+			"node still carries stale complete state after a reboot on a busy node; reapply will never happen")
+
+		// Only now that the reset is durable is the reboot recorded as handled.
+		sh := clusterState.skyhooks[0].GetSkyhook()
+		Expect(sh.Status.NodeBootIds[nodeName]).To(Equal(newBootID))
+	})
+})
@@ -489,11 +489,15 @@ func (node *skyhookNode) Reset() {
 	node.skyhook.Status.Status = v1alpha1.StatusUnknown
 	node.skyhook.Updated = true
 
-	delete(node.Annotations, fmt.Sprintf("%s/cordon_", v1alpha1.METADATA_PREFIX))
+	delete(node.Annotations, fmt.Sprintf("%s/cordon_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))
 	delete(node.Annotations, fmt.Sprintf("%s/nodeState_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))
 	delete(node.Annotations, fmt.Sprintf("%s/status_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))
 
 	delete(node.Labels, fmt.Sprintf("%s/status_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))
+
+	// We just wiped the nodeState annotation; invalidate the in-memory cache so a later
+	// State() read in this reconcile doesn't serve the stale (pre-reset) map.
+	node.nodeState = nil
 	node.updated = true
 }