Skip to content

Commit 51f1e17

Browse files
enoodleclaude
andcommitted
fix(scheduler): fix log messages and avoid vector mutation in comparisons
- Fix log messages in allocate.go that said "requires: %v GPUs" but now print a full resource vector - Rewrite lessEqualVectorsExcludingGPU to iterate and skip the GPU index instead of mutating input vectors via save/zero/compare/restore, which is not safe under concurrent access Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 639c4b9 commit 51f1e17

3 files changed

Lines changed: 15 additions & 14 deletions

File tree

pkg/scheduler/actions/common/allocate.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ func allocateTaskToNode(ssn *framework.Session, stmt *framework.Statement, task
174174
}
175175

176176
func bindTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo) bool {
177-
log.InfraLogger.V(6).Infof("Binding Task <%v/%v> to node <%v>, requires: %v GPUs",
177+
log.InfraLogger.V(6).Infof("Binding Task <%v/%v> to node <%v>, requires resources: %v",
178178
task.Namespace, task.Name, node.Name, task.ResReqVector)
179179

180180
if err := stmt.Allocate(task, node.Name); err != nil {
@@ -185,7 +185,7 @@ func bindTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod
185185
}
186186

187187
func pipelineTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo, updateTasksIfExistsOnNode bool) bool {
188-
log.InfraLogger.V(6).Infof("Pipelining Task <%v/%v> to node <%v> requires: %v GPUs",
188+
log.InfraLogger.V(6).Infof("Pipelining Task <%v/%v> to node <%v>, requires resources: %v",
189189
task.Namespace, task.Name, node.Name, task.ResReqVector)
190190

191191
if err := stmt.Pipeline(task, node.Name, updateTasksIfExistsOnNode); err != nil {

pkg/scheduler/api/node_info/gpu_sharing_node_info.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ func getAcceptedTaskResourceWithoutSharedGPU(task *pod_info.PodInfo) *resource_i
7171
return requestedResourceWithoutSharedGPU
7272
}
7373

74-
func (ni *NodeInfo) addSharedTaskResources(task *pod_info.PodInfo) {
74+
func (ni *NodeInfo) addSharedGPUTaskResources(task *pod_info.PodInfo) {
7575
if !task.IsSharedGPUAllocation() {
7676
return
7777
}
@@ -80,14 +80,14 @@ func (ni *NodeInfo) addSharedTaskResources(task *pod_info.PodInfo) {
8080
task.Namespace, task.Name, task.Status, ni)
8181

8282
for _, gpuGroup := range task.GPUGroups {
83-
ni.addSharedTaskResourcesPerPodGroup(task, gpuGroup)
83+
ni.addSharedGPUTaskResourcesPerPodGroup(task, gpuGroup)
8484
}
8585

8686
log.InfraLogger.V(8).Infof("Added shared podsInfo: <%v/%v>, status: <%v>, node: <%+v>",
8787
task.Namespace, task.Name, task.Status, ni)
8888
}
8989

90-
func (ni *NodeInfo) addSharedTaskResourcesPerPodGroup(task *pod_info.PodInfo, gpuGroup string) {
90+
func (ni *NodeInfo) addSharedGPUTaskResourcesPerPodGroup(task *pod_info.PodInfo, gpuGroup string) {
9191
log.InfraLogger.V(7).Infof(
9292
"About to add shared podsInfo: <%v/%v>, gpuGroup: <%v> "+
9393
"releasingSharedGPU: <%v> AllocatedSharedGPUsMemory <%v>, UsedSharedGPUsMemory: <%v>",

pkg/scheduler/api/node_info/node_info.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -393,14 +393,15 @@ func (ni *NodeInfo) isTaskAllocatableOnNonAllocatedResources(
393393

394394
func (ni *NodeInfo) lessEqualVectorsExcludingGPU(a, b resource_info.ResourceVector) bool {
395395
gpuIdx := ni.VectorMap.GetIndex(commonconstants.GpuResource)
396-
savedA := a.Get(gpuIdx)
397-
savedB := b.Get(gpuIdx)
398-
a.Set(gpuIdx, 0)
399-
b.Set(gpuIdx, 0)
400-
result := a.LessEqual(b)
401-
a.Set(gpuIdx, savedA)
402-
b.Set(gpuIdx, savedB)
403-
return result
396+
for i := 0; i < len(a); i++ {
397+
if i == gpuIdx {
398+
continue
399+
}
400+
if a.Get(i) > b.Get(i) {
401+
return false
402+
}
403+
}
404+
return true
404405
}
405406

406407
func (ni *NodeInfo) AddTask(task *pod_info.PodInfo) error {
@@ -508,7 +509,7 @@ func (ni *NodeInfo) addTaskResources(task *pod_info.PodInfo) {
508509
ni.IdleVector.Sub(resourcesToTrackVector)
509510
}
510511

511-
ni.addSharedTaskResources(task)
512+
ni.addSharedGPUTaskResources(task)
512513

513514
log.InfraLogger.V(8).Infof("Added podsInfo: <%v/%v>, status: <%v>, node: <%+v>",
514515
task.Namespace, task.Name, task.Status, ni)

0 commit comments

Comments
 (0)