Skip to content

Commit b034540

Browse files
committed
fix: batch stickiness — nodes in NodePriority finish all packages before new nodes are picked
Between packages, IntrospectNode transitions nodes from InProgress → Waiting, causing getInProgressCount() to return 0 and createNewBatch() to pick new nodes prematurely. With a Fixed 1-by-1 strategy, this meant all nodes processed in parallel instead of sequentially. GetNodesForNextBatch now checks NodePriority (via getStickyBatchNodes) before falling through to createNewBatch. Nodes already in NodePriority that aren't Complete are returned as the current sticky batch.
1 parent ed9eb69 commit b034540

File tree

3 files changed

+247
-18
lines changed

3 files changed

+247
-18
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@
66
coverage.out
77

88
# Symlinks created by Makefile build-cli
9-
k8s-tests/chainsaw/*/skyhook-cli
9+
k8s-tests/chainsaw/*/skyhook-cli
10+
docs/plans/*
11+
.claude/

operator/internal/wrapper/compartment.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
*
@@ -22,6 +22,10 @@ import (
2222
"github.com/NVIDIA/skyhook/operator/api/v1alpha1"
2323
)
2424

25+
// Compartment invariant: all nodes in a Compartment belong to the same Skyhook.
26+
// Compartments are populated per-Skyhook in BuildState and partitionNodesIntoCompartments,
27+
// so node.GetSkyhook() returns the same Skyhook for every node in the compartment.
28+
2529
func NewCompartmentWrapper(c *v1alpha1.Compartment, batchState *v1alpha1.BatchProcessingState) *Compartment {
2630
comp := &Compartment{
2731
Compartment: *c,
@@ -112,10 +116,39 @@ func (c *Compartment) GetNodesForNextBatch() []SkyhookNode {
112116
return c.getInProgressNodes()
113117
}
114118

119+
// Sticky batch: nodes in NodePriority that aren't Complete yet should
120+
// continue processing before we pick new nodes. This handles the case where
121+
// IntrospectNode transitions nodes from InProgress → Waiting between packages.
122+
if stickyNodes := c.getStickyBatchNodes(); len(stickyNodes) > 0 {
123+
return stickyNodes
124+
}
125+
115126
// No batch in progress, create a new one
116127
return c.createNewBatch()
117128
}
118129

130+
// getStickyBatchNodes returns nodes that are in NodePriority but not yet Complete.
131+
// These nodes were previously picked for a batch and should finish all their packages
132+
// before new nodes are selected.
133+
func (c *Compartment) getStickyBatchNodes() []SkyhookNode {
134+
if len(c.Nodes) == 0 {
135+
return nil
136+
}
137+
138+
skyhook := c.Nodes[0].GetSkyhook()
139+
if skyhook == nil || skyhook.Skyhook == nil || skyhook.Status.NodePriority == nil {
140+
return nil
141+
}
142+
143+
stickyNodes := make([]SkyhookNode, 0)
144+
for _, node := range c.Nodes {
145+
if _, inPriority := skyhook.Status.NodePriority[node.GetNode().Name]; inPriority && !node.IsComplete() {
146+
stickyNodes = append(stickyNodes, node)
147+
}
148+
}
149+
return stickyNodes
150+
}
151+
119152
func (c *Compartment) getInProgressNodes() []SkyhookNode {
120153
inProgressNodes := make([]SkyhookNode, 0)
121154
for _, node := range c.Nodes {

0 commit comments

Comments
 (0)