@@ -71,16 +71,17 @@ func NewController(
7171 numWorkers int ,
7272) (* controller , error ) {
7373 ctl := & controller {
74- enqueueLogger : logger .WithName (ControllerName ),
75- coreclient : coreClient ,
76- namespace : namespace ,
77- podInformer : corev1PreInformers .Pods ().Informer (),
78- podLister : corev1PreInformers .Pods ().Lister (),
79- cmInformer : corev1PreInformers .ConfigMaps ().Informer (),
80- cmLister : corev1PreInformers .ConfigMaps ().Lister (),
81- nodeInformer : corev1PreInformers .Nodes ().Informer (),
82- nodeLister : corev1PreInformers .Nodes ().Lister (),
83- requesters : make (map [string ]* requesterData ),
74+ enqueueLogger : logger .WithName (ControllerName ),
75+ coreclient : coreClient ,
76+ namespace : namespace ,
77+ podInformer : corev1PreInformers .Pods ().Informer (),
78+ podLister : corev1PreInformers .Pods ().Lister (),
79+ cmInformer : corev1PreInformers .ConfigMaps ().Informer (),
80+ cmLister : corev1PreInformers .ConfigMaps ().Lister (),
81+ nodeInformer : corev1PreInformers .Nodes ().Informer (),
82+ nodeLister : corev1PreInformers .Nodes ().Lister (),
83+ requesters : make (map [string ]* requesterData ),
84+ inferenceServers : make (map [apitypes.UID ]* serverData ),
8485 }
8586 ctl .gpuMap .Store (& map [string ]GpuLocation {})
8687 ctl .QueueAndWorkers = genctlr .NewQueueAndWorkers (string (ControllerName ), numWorkers , ctl .process )
@@ -114,8 +115,13 @@ type controller struct {
114115
115116 // requesters maps server-requesting Pod name to data
116117 requesters map [string ]* requesterData
118+
119+ // inferenceServers maps UID of server-requesting Pod to data
120+ inferenceServers map [apitypes.UID ]* serverData
117121}
118122
123+ var _ Controller = & controller {}
124+
119125type GpuLocation struct {
120126 Node string
121127 Index uint
@@ -126,7 +132,11 @@ type requesterData struct {
126132 GPUIndices * string
127133}
128134
129- var _ Controller = & controller {}
// serverData is the controller's internal bookkeeping for one inference
// server. The controller keeps these in a map keyed by the UID of the
// server-requesting Pod.
type serverData struct {
	// requestingPodName is the name of the server-requesting Pod that this
	// entry was created for; it is used to find entries when clearing by name.
	requestingPodName string

	// ReadinessRelayed is nil on a freshly created entry.
	// NOTE(review): presumably records whether the server's readiness has
	// been relayed to the requester; confirm against the code that sets it.
	ReadinessRelayed *bool
}
130140
131141type typedRef struct {
132142 Kind string
@@ -294,25 +304,45 @@ func (ctl *controller) processConfigMap(ctx context.Context, cmRef cache.ObjectN
294304 }
295305 return err , true
296306 }
307+ oldMap := ctl .gpuMap .Load ()
297308 newMap := map [string ]GpuLocation {}
298309 nodeCount := 0
310+ additions := 0
299311 for nodeName , mapStr := range cm .Data {
300- var nodesMap map [string ]uint
301- err = json .Unmarshal ([]byte (mapStr ), & nodesMap )
312+ var newNodesMap map [string ]uint
313+ err = json .Unmarshal ([]byte (mapStr ), & newNodesMap )
302314 if err != nil {
303315 logger .Error (err , "A GPU map entry failed to parse as JSON" , "nodeName" , nodeName )
304316 continue
305317 }
306- for uuid , index := range nodesMap {
307- newMap [uuid ] = GpuLocation {Node : nodeName , Index : index }
318+ for uuid , index := range newNodesMap {
319+ newLoc := GpuLocation {Node : nodeName , Index : index }
320+ if oldMap == nil || (* oldMap )[uuid ] != newLoc {
321+ additions ++
322+ }
323+ newMap [uuid ] = newLoc
308324 }
309325 nodeCount += 1
310326 }
311- logger .V (1 ).Info ("Parsed GPU map" , "numNodes" , nodeCount , "numGPUs" , len (newMap ))
327+ logger .V (1 ).Info ("Parsed GPU map" , "numNodes" , nodeCount , "numGPUs" , len (newMap ), "additions" , additions )
312328 ctl .gpuMap .Store (& newMap )
329+ if additions > 0 {
330+ ctl .enqueueRequesters (ctx )
331+ }
313332 return nil , false
314333}
315334
335+ func (ctl * controller ) enqueueRequesters (ctx context.Context ) {
336+ ctl .mutex .Lock ()
337+ defer ctl .mutex .Unlock ()
338+ logger := klog .FromContext (ctx )
339+ for reqPodName := range ctl .requesters {
340+ ref := typedRef {Kind : podKind , ObjectName : cache.ObjectName {Namespace : ctl .namespace , Name : reqPodName }}
341+ logger .V (5 ).Info ("Enqueuing server-requesting Pod because of change to GPU map" , "ref" , ref )
342+ ctl .Queue .Add (ref )
343+ }
344+ }
345+
316346func (ctl * controller ) getRequesterData (name string , podUID apitypes.UID , insist bool ) * requesterData {
317347 ctl .mutex .Lock ()
318348 defer ctl .mutex .Unlock ()
@@ -329,3 +359,24 @@ func (ctl *controller) clearRequesterData(name string) {
329359 defer ctl .mutex .Unlock ()
330360 delete (ctl .requesters , name )
331361}
362+
363+ func (ctl * controller ) getServerData (reqName string , reqUID apitypes.UID , insist bool ) * serverData {
364+ ctl .mutex .Lock ()
365+ defer ctl .mutex .Unlock ()
366+ ans := ctl .inferenceServers [reqUID ]
367+ if ans == nil && insist {
368+ ans = & serverData {requestingPodName : reqName }
369+ ctl .inferenceServers [reqUID ] = ans
370+ }
371+ return ans
372+ }
373+
374+ func (ctl * controller ) clearServerData (reqName string ) {
375+ ctl .mutex .Lock ()
376+ defer ctl .mutex .Unlock ()
377+ for uid , serveDat := range ctl .inferenceServers {
378+ if serveDat .requestingPodName == reqName {
379+ delete (ctl .inferenceServers , uid )
380+ }
381+ }
382+ }
0 commit comments