Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions operator/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,22 @@

All notable changes to this project will be documented in this file.

## [Unreleased]

### Bug Fixes

- **Reapply-on-reboot dropped on busy nodes.** With `REAPPLY_ON_REBOOT=true`, a
reboot of a node under heavy controller churn (frequent pod/label/annotation
updates) could be detected and then silently lost: the per-node state reset
was persisted with a full `Update` that lost an optimistic-concurrency race,
yet the node's boot id was advanced anyway, marking the reboot handled. The
node kept its stale `complete` state and the package was never reapplied
(`unknown -> complete`, no pod). The reset now persists via a strategic-merge
`Patch` (not resourceVersion-gated, like the rest of the reconcile), and the
boot id is advanced only after that write succeeds, so a failed reset leaves
the reboot pending to be retried. Also fixes `Reset()` deleting the cordon
annotation with a key missing the Skyhook name.

## [operator/v0.16.1] - 2026-05-22

### Bug Fixes
Expand Down
25 changes: 17 additions & 8 deletions operator/internal/controller/skyhook_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -628,18 +628,27 @@ func (r *SkyhookReconciler) TrackReboots(ctx context.Context, clusterState *clus
r.recorder.Eventf(skyhook.GetSkyhook().Skyhook, nil, EventTypeNormal, EventsReasonNodeReboot, "ResetNodeState", "detected reboot, resetting node [%s] to be reapplied", node.GetNode().Name)
r.recorder.Eventf(node.GetNode(), nil, EventTypeNormal, EventsReasonNodeReboot, "ResetNodeState", "detected reboot, resetting node for [%s] to be reapplied", node.GetSkyhook().Name)
node.Reset()

// Persist the reset before recording the new boot id. We Patch rather than
// Update because a busy node's resourceVersion churns constantly under other
// controllers, and a full Update would lose that optimistic-concurrency race; a
// strategic merge of only our annotation/label changes does not. And we advance
// NodeBootIds only after the write is durable: if the reset never lands, leaving
// the boot id unchanged keeps the reboot pending so it is re-detected and retried
// next reconcile, instead of being silently consumed while the node's stale
// "complete" state remains and the package is never reapplied.
if node.Changed() {
updates = true
patch := client.StrategicMergeFrom(clusterState.tracker.GetOriginal(node.GetNode()))
if err := r.Patch(ctx, node.GetNode(), patch); err != nil {
errs = append(errs, fmt.Errorf("error patching node after reboot [%s]: %w", node.GetNode().Name, err))
continue
}
}
}
skyhook.GetSkyhook().Status.NodeBootIds[node.GetNode().Name] = node.GetNode().Status.NodeInfo.BootID
skyhook.GetSkyhook().Updated = true
}

if node.Changed() { // update
updates = true
err := r.Update(ctx, node.GetNode())
if err != nil {
errs = append(errs, fmt.Errorf("error updating node after reboot [%s]: %w", node.GetNode().Name, err))
}
}
}
if skyhook.GetSkyhook().Updated { // update
updates = true
Expand Down
121 changes: 121 additions & 0 deletions operator/internal/controller/skyhook_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3431,3 +3431,124 @@ func TestHandleVersionChange_DowngradeIsNoOp(t *testing.T) {
g.Expect(err).To(BeNil())
})
}

// Reproduction for the "reapply-on-reboot loses the reboot on busy nodes" bug.
//
// On a node with high churn (many controllers writing labels/annotations/status), the
// node-state reset that TrackReboots performs on reboot detection is persisted with a full,
// optimistic-concurrency r.Update(node). That Update loses the resourceVersion race and is
// rejected, but TrackReboots advances Status.NodeBootIds BEFORE (and independent of) that
// write, so the reboot is "consumed" and never retried. The node keeps its stale "complete"
// node-state annotation, and on the next reconcile derives complete -> no reapply pod.
//
// This spec drives the real envtest apiserver so the 409 is genuine (not injected). It
// asserts on the in-memory clusterState object so the running manager (which reconciles real
// CRs asynchronously) cannot race the assertion — the manager cannot touch our Go object.
var _ = Describe("TrackReboots reapply-on-reboot on a busy node", func() {

It("must not consume the reboot when the node-state reset write loses an optimistic-concurrency race", func() {
const (
skyhookName = "reboot-conflict-sh"
nodeName = "reboot-conflict-node"
oldBootID = "boot-A"
newBootID = "boot-B"
)
nodeLabel := map[string]string{"reboot-conflict-test": "yes"}

pkgRef := v1alpha1.PackageRef{Name: "pkg1", Version: "1.0.0"}
completeState := v1alpha1.NodeState{}
completeState.Upsert(pkgRef, "ghcr.io/org/pkg1", v1alpha1.StateComplete, v1alpha1.StageConfig, 0, "")
stateJSON, err := json.Marshal(completeState)
Expect(err).ToNot(HaveOccurred())
nodeStateKey := fmt.Sprintf("%s/nodeState_%s", v1alpha1.METADATA_PREFIX, skyhookName)

// Real Node in the apiserver carrying the old "complete" node-state annotation. BootID
// lives on the status subresource (not persisted by Create); we only need it on the
// in-memory snapshot below for reboot detection.
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Labels: nodeLabel,
Annotations: map[string]string{nodeStateKey: string(stateJSON)},
},
}
Expect(k8sClient.Create(ctx, node)).To(Succeed())
DeferCleanup(func() {
_ = k8sClient.Delete(ctx, node)
})

// Persist the post-reboot BootID to the Node status (as the kubelet would). This makes
// TrackReboots see a reboot, and lets the reset Patch round-trip preserve the BootID so
// the new boot id is recorded correctly.
node.Status.NodeInfo.BootID = newBootID
Expect(k8sClient.Status().Update(ctx, node)).To(Succeed())

// The operator's in-memory snapshot, taken at this resourceVersion; the busy-node churn
// below advances the live Node past it.
snapshotNode := node.DeepCopy()

// Skyhook is kept in memory only: creating it would let the running manager reconcile
// it asynchronously. We assert on the in-memory clusterState, which the manager cannot
// reach, so the test stays deterministic.
skyhook := v1alpha1.Skyhook{
ObjectMeta: metav1.ObjectMeta{Name: skyhookName},
Spec: v1alpha1.SkyhookSpec{
NodeSelector: metav1.LabelSelector{MatchLabels: nodeLabel},
Packages: v1alpha1.Packages{
pkgRef.Name: {PackageRef: pkgRef, Image: "ghcr.io/org/pkg1"},
},
},
Status: v1alpha1.SkyhookStatus{
NodeBootIds: map[string]string{nodeName: oldBootID},
},
}

clusterState, err := BuildState(
&v1alpha1.SkyhookList{Items: []v1alpha1.Skyhook{skyhook}},
&corev1.NodeList{Items: []corev1.Node{*snapshotNode}},
&v1alpha1.DeploymentPolicyList{},
)
Expect(err).ToNot(HaveOccurred())
Expect(clusterState.skyhooks).To(HaveLen(1))
Expect(clusterState.skyhooks[0].GetNodes()).To(HaveLen(1), "node must be paired to the skyhook")

// NOISE: simulate a busy node — a concurrent controller mutates the live Node, bumping
// its resourceVersion. A single out-of-band write is enough: TrackReboots persists the
// node with a full Update, which conflicts if the live object moved at all. The 409 is
// the real apiserver's response, not an injected error, and is deterministic with no
// sleeps or polling.
churn := &corev1.Node{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, churn)).To(Succeed())
churn.Labels["busy-controller"] = "wrote-here"
Expect(k8sClient.Update(ctx, churn)).To(Succeed())

r := &SkyhookReconciler{
Client: k8sClient,
recorder: operator.recorder,
opts: SkyhookOperatorOptions{ReapplyOnReboot: true},
}

_, err = r.TrackReboots(ctx, clusterState)

// The node-state reset must NOT fail because the live Node moved under concurrent churn.
// A merge Patch is not resourceVersion-gated, so it lands where the old full Update lost
// a 409. (The skyhook-status write is expected to fail here only because this test keeps
// the Skyhook in memory; that is unrelated to the node write under test.)
if err != nil {
Expect(err.Error()).ToNot(ContainSubstring("node after reboot"),
"the node-state reset write must not fail on a busy node")
}

// PRIMARY: the stale "complete" node-state annotation must be cleared on the apiserver,
// so the package will be reapplied. On the buggy full-Update path the write is rejected
// and this annotation survives, so this assertion fails.
live := &corev1.Node{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, live)).To(Succeed())
Expect(live.Annotations).ToNot(HaveKey(nodeStateKey),
"node still carries stale complete state after a reboot on a busy node; reapply will never happen")

// Only now that the reset is durable is the reboot recorded as handled.
sh := clusterState.skyhooks[0].GetSkyhook()
Expect(sh.Status.NodeBootIds[nodeName]).To(Equal(newBootID))
})
})
6 changes: 5 additions & 1 deletion operator/internal/wrapper/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,11 +489,15 @@ func (node *skyhookNode) Reset() {
node.skyhook.Status.Status = v1alpha1.StatusUnknown
node.skyhook.Updated = true

delete(node.Annotations, fmt.Sprintf("%s/cordon_", v1alpha1.METADATA_PREFIX))
delete(node.Annotations, fmt.Sprintf("%s/cordon_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))
delete(node.Annotations, fmt.Sprintf("%s/nodeState_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))
delete(node.Annotations, fmt.Sprintf("%s/status_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))

delete(node.Labels, fmt.Sprintf("%s/status_%s", v1alpha1.METADATA_PREFIX, node.skyhook.Name))

// We just wiped the nodeState annotation; invalidate the in-memory cache so a later
// State() read in this reconcile doesn't serve the stale (pre-reset) map.
node.nodeState = nil
node.updated = true
}

Expand Down
Loading