Skip to content

Commit 50340ff

Browse files
committed
Reconsider requesters after GPU map update
Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 68d2c69 commit 50340ff

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

pkg/controller/dual-pods/controller.go

Lines changed: 25 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -304,25 +304,45 @@ func (ctl *controller) processConfigMap(ctx context.Context, cmRef cache.ObjectN
304304
}
305305
return err, true
306306
}
307+
oldMap := ctl.gpuMap.Load()
307308
newMap := map[string]GpuLocation{}
308309
nodeCount := 0
310+
additions := 0
309311
for nodeName, mapStr := range cm.Data {
310-
var nodesMap map[string]uint
311-
err = json.Unmarshal([]byte(mapStr), &nodesMap)
312+
var newNodesMap map[string]uint
313+
err = json.Unmarshal([]byte(mapStr), &newNodesMap)
312314
if err != nil {
313315
logger.Error(err, "A GPU map entry failed to parse as JSON", "nodeName", nodeName)
314316
continue
315317
}
316-
for uuid, index := range nodesMap {
317-
newMap[uuid] = GpuLocation{Node: nodeName, Index: index}
318+
for uuid, index := range newNodesMap {
319+
newLoc := GpuLocation{Node: nodeName, Index: index}
320+
if oldMap == nil || (*oldMap)[uuid] != newLoc {
321+
additions++
322+
}
323+
newMap[uuid] = newLoc
318324
}
319325
nodeCount += 1
320326
}
321-
logger.V(1).Info("Parsed GPU map", "numNodes", nodeCount, "numGPUs", len(newMap))
327+
logger.V(1).Info("Parsed GPU map", "numNodes", nodeCount, "numGPUs", len(newMap), "additions", additions)
322328
ctl.gpuMap.Store(&newMap)
329+
if additions > 0 {
330+
ctl.enqueueRequesters(ctx)
331+
}
323332
return nil, false
324333
}
325334

335+
// enqueueRequesters adds every currently known server-requesting Pod to the
// controller's work queue so that each one is processed again.
//
// It is invoked by processConfigMap after a new GPU map has been stored, and
// only when that map contains at least one GPU UUID whose location is new or
// differs from the previously stored map.
//
// For each key of ctl.requesters it builds a typedRef of kind podKind in
// ctl.namespace, logs the reference at verbosity 5 using the logger carried
// by ctx, and passes the reference to ctl.Queue.Add.
//
// ctl.mutex is held for the whole iteration, so callers must not already
// hold it.
// NOTE(review): ctl.Queue.Add runs while ctl.mutex is held; presumably Queue
// is a client-go workqueue whose Add never waits on ctl.mutex — confirm.
func (ctl *controller) enqueueRequesters(ctx context.Context) {
336+
ctl.mutex.Lock()
337+
defer ctl.mutex.Unlock()
338+
logger := klog.FromContext(ctx)
339+
for reqPodName := range ctl.requesters {
340+
ref := typedRef{Kind: podKind, ObjectName: cache.ObjectName{Namespace: ctl.namespace, Name: reqPodName}}
341+
logger.V(5).Info("Enqueuing server-requesting Pod because of change to GPU map", "ref", ref)
342+
ctl.Queue.Add(ref)
343+
}
344+
}
345+
326346
func (ctl *controller) getRequesterData(name string, podUID apitypes.UID, insist bool) *requesterData {
327347
ctl.mutex.Lock()
328348
defer ctl.mutex.Unlock()

0 commit comments

Comments
 (0)