@@ -178,6 +178,7 @@ func (config ControllerConfig) NewController(
178178 inferenceServerConfigIndexName : inferenceServerConfigIndexFunc ,
179179 launcherConfigHashIndexName : launcherConfigHashIndexFunc ,
180180 requesterIndexName : requesterIndexFunc ,
181+ nodeNameIndexName : nodeNameIndexFunc ,
181182 nominalHashIndexName : nominalHashIndexFunc ,
182183 GPUIndexName : GPUIndexFunc })
183184 if err != nil { //impossible
@@ -258,7 +259,7 @@ type nodeData struct {
258259 InferenceServers map [apitypes.UID ]* serverData
259260
260261 // Launchers maps name of launcher-based server-providing Pod to launcherData.
261- // Access only while holding controller mutex .
262+ // Access only from within the call hierarchy rooted at `nodeItem.process()`.
262263 Launchers map [string ]* launcherData
263264
264265 // ItemsMutex may be acquired while holding controller mutex, not vice-versa.
@@ -330,34 +331,47 @@ type infSvrItem struct {
330331 RequesterName string
331332}
332333
// launcherPodItem is a work item that refers to a launcher Pod by name,
// together with the name of the node that the Pod is attributed to.
// It holds only strings, so values are comparable with ==.
type launcherPodItem struct {
	// LauncherPodName is the name of the launcher Pod.
	LauncherPodName string

	// NodeName is the name of the node associated with the launcher Pod.
	NodeName string
}
338+
// infSvrItemType says which kind of Pod an infSvrItem was derived from,
// i.e. why (or whether) the controller is interested in that Pod.
type infSvrItemType string

const (
	// infSvrItemRequester marks a server-requesting Pod.
	infSvrItemRequester = infSvrItemType("requester")

	// infSvrItemBoundProvider marks a server-providing Pod that
	// is bound to a server-requesting Pod.
	infSvrItemBoundProvider = infSvrItemType("bound_provider")

	// infSvrItemUnboundLauncherBasedProvider marks a server-providing Pod that
	// is launcher-based and is not bound to any server-requesting Pod.
	infSvrItemUnboundLauncherBasedProvider = infSvrItemType("unbound_launcher_based_provider")

	// infSvrItemDontCare is not a real infSvrItemType; it is a placeholder
	// meaning that the corresponding infSvrItem is irrelevant to the controller.
	infSvrItemDontCare = infSvrItemType("dont_care")
)
347354
348355// careAbout returns an infSvrItem and an infSvrItemType.
349- // The controller cares about server-requesting Pods, bound direct server-providing Pods, and launcher-based server-providing Pods.
356+ // The controller cares about
357+ // - server-requesting Pods,
358+ // - bound server-providing Pods,
359+ // - unbound launcher-based server-providing Pods.
350360// The controller doesn't care about unbound direct providers and other Pods.
351361func careAbout (pod * corev1.Pod ) (item infSvrItem , it infSvrItemType ) {
352362 if len (pod .Annotations [api .ServerPatchAnnotationName ]) > 0 || len (pod .Annotations [api .InferenceServerConfigAnnotationName ]) > 0 {
353363 return infSvrItem {pod .UID , pod .Name }, infSvrItemRequester
354364 }
355365 requesterStr := pod .Annotations [requesterAnnotationKey ]
356366 requesterParts := strings .Split (requesterStr , " " )
357- if len (requesterParts ) ! = 2 {
358- return infSvrItem {}, infSvrItemDontCare
367+ if len (requesterParts ) = = 2 {
368+ return infSvrItem {apitypes . UID ( requesterParts [ 0 ]), requesterParts [ 1 ] }, infSvrItemBoundProvider
359369 }
360- return infSvrItem {apitypes .UID (requesterParts [0 ]), requesterParts [1 ]}, infSvrItemBoundDirectProvider
370+ if _ , hasLauncherLabel := pod .Labels [ctlrcommon .LauncherConfigNameLabelKey ]; hasLauncherLabel {
371+ // For an unbound launcher-based server-providing Pod, use the Pod's own UID and name
372+ return infSvrItem {pod .UID , pod .Name }, infSvrItemUnboundLauncherBasedProvider
373+ }
374+ return infSvrItem {}, infSvrItemDontCare
361375}
362376
363377const inferenceServerConfigIndexName = "inferenceserverconfig"
@@ -387,21 +401,43 @@ const requesterIndexName = "requester"
387401func requesterIndexFunc (obj any ) ([]string , error ) {
388402 pod := obj .(* corev1.Pod )
389403 item , it := careAbout (pod )
390- if it == infSvrItemBoundDirectProvider {
404+ if it == infSvrItemBoundProvider {
391405 return []string {string (item .UID )}, nil
392406 }
393407 return []string {}, nil
394408}
395409
410+ const nodeNameIndexName = "nodeName"
411+
412+ func nodeNameIndexFunc (obj any ) ([]string , error ) {
413+ pod := obj .(* corev1.Pod )
414+ if pod .Spec .NodeName == "" {
415+ return []string {}, nil
416+ }
417+ return []string {pod .Spec .NodeName }, nil
418+ }
419+
396420func (ctl * controller ) OnAdd (obj any , isInInitialList bool ) {
397421 switch typed := obj .(type ) {
398422 case * corev1.Pod :
399423 if item , it := careAbout (typed ); it == infSvrItemDontCare {
400424 ctl .enqueueLogger .V (5 ).Info ("Ignoring add of irrelevant Pod" , "name" , typed .Name )
401425 return
426+ } else if it == infSvrItemUnboundLauncherBasedProvider {
427+ nodeName , err := getProviderNodeName (typed )
428+ if err != nil {
429+ ctl .enqueueLogger .Error (err , "Failed to determine node of launcher" )
430+ return
431+ }
432+ nd := ctl .getNodeData (nodeName )
433+ launcherPodItem := launcherPodItem {LauncherPodName : typed .Name , NodeName : nodeName }
434+ ctl .enqueueLogger .V (5 ).Info ("Enqueuing launcher reference due to notification of add" ,
435+ "nodeName" , nodeName , "launcherPod" , typed .Name , "isInInitialList" , isInInitialList , "resourceVersion" , typed .ResourceVersion )
436+ nd .add (launcherPodItem )
437+ ctl .Queue .Add (nodeItem {nodeName })
402438 } else {
403439 nodeName := typed .Spec .NodeName
404- if it == infSvrItemBoundDirectProvider || it == infSvrItemLauncherBasedProvider {
440+ if it == infSvrItemBoundProvider {
405441 var err error
406442 nodeName , err = getProviderNodeName (typed )
407443 if err != nil {
@@ -440,9 +476,21 @@ func (ctl *controller) OnUpdate(prev, obj any) {
440476 if item , it := careAbout (typed ); it == infSvrItemDontCare {
441477 ctl .enqueueLogger .V (5 ).Info ("Ignoring update of irrelevant Pod" , "name" , typed .Name )
442478 return
479+ } else if it == infSvrItemUnboundLauncherBasedProvider {
480+ nodeName , err := getProviderNodeName (typed )
481+ if err != nil {
482+ ctl .enqueueLogger .Error (err , "Failed to determine node of launcher" )
483+ return
484+ }
485+ nd := ctl .getNodeData (nodeName )
486+ launcherPodItem := launcherPodItem {LauncherPodName : typed .Name , NodeName : nodeName }
487+ ctl .enqueueLogger .V (5 ).Info ("Enqueuing launcher reference due to notification of update" ,
488+ "nodeName" , nodeName , "launcherPod" , typed .Name , "resourceVersion" , typed .ResourceVersion )
489+ nd .add (launcherPodItem )
490+ ctl .Queue .Add (nodeItem {nodeName })
443491 } else {
444492 nodeName := typed .Spec .NodeName
445- if it == infSvrItemBoundDirectProvider || it == infSvrItemLauncherBasedProvider {
493+ if it == infSvrItemBoundProvider {
446494 var err error
447495 nodeName , err = getProviderNodeName (typed )
448496 if err != nil {
@@ -484,9 +532,21 @@ func (ctl *controller) OnDelete(obj any) {
484532 if item , it := careAbout (typed ); it == infSvrItemDontCare {
485533 ctl .enqueueLogger .V (5 ).Info ("Ignoring delete of irrelevant Pod" , "name" , typed .Name )
486534 return
535+ } else if it == infSvrItemUnboundLauncherBasedProvider {
536+ nodeName , err := getProviderNodeName (typed )
537+ if err != nil {
538+ ctl .enqueueLogger .Error (err , "Failed to determine node of launcher" )
539+ return
540+ }
541+ nd := ctl .getNodeData (nodeName )
542+ launcherPodItem := launcherPodItem {LauncherPodName : typed .Name , NodeName : nodeName }
543+ ctl .enqueueLogger .V (5 ).Info ("Enqueuing launcher reference due to notification of delete" ,
544+ "nodeName" , nodeName , "launcherPod" , typed .Name , "resourceVersion" , typed .ResourceVersion )
545+ nd .add (launcherPodItem )
546+ ctl .Queue .Add (nodeItem {nodeName })
487547 } else {
488548 nodeName := typed .Spec .NodeName
489- if it == infSvrItemBoundDirectProvider || it == infSvrItemLauncherBasedProvider {
549+ if it == infSvrItemBoundProvider {
490550 var err error
491551 nodeName , err = getProviderNodeName (typed )
492552 if err != nil {
@@ -630,6 +690,40 @@ func (ctl *controller) enqueueRequestersByInferenceServerConfig(isc *fmav1alpha1
630690 }
631691}
632692
693+ func (ctl * controller ) enqueueUnboundInfSvrItemsOnNode (ctx context.Context , nodeName string , whyEnqueue string ) {
694+ logger := klog .FromContext (ctx )
695+ nd := ctl .getNodeData (nodeName )
696+ itemCount := 0
697+ podObjs , err := ctl .podInformer .GetIndexer ().ByIndex (nodeNameIndexName , nodeName )
698+ if err != nil {
699+ logger .Error (err , "Failed to list Pods by nodeName index" , "nodeName" , nodeName , "whyEnqueue" , whyEnqueue )
700+ return
701+ }
702+ for _ , podObj := range podObjs {
703+ pod := podObj .(* corev1.Pod )
704+ item , it := careAbout (pod )
705+ if it != infSvrItemRequester {
706+ continue
707+ }
708+ // skip bound Inference Servers
709+ // a podObj could be either a server-requesting Pod or a server-providing Pod
710+ // but after the `it != infSvrItemRequester` check above, it must be a server-requesting Pod here, and we want to skip it if it's bound to a server-providing Pod
711+ // we can use the controller's data to check whether it's bound or not
712+ serverDat := ctl .getServerData (nd , pod .Name , pod .UID )
713+ if serverDat .ProvidingPodName != "" {
714+ continue
715+ }
716+ nd .add (item )
717+ itemCount ++
718+ }
719+ if itemCount == 0 {
720+ logger .V (5 ).Info ("No unbound infSvrItems to enqueue on node" , "node" , nodeName , "whyEnqueue" , whyEnqueue )
721+ return
722+ }
723+ logger .V (5 ).Info ("Enqueuing unbound infSvrItems on node" , "node" , nodeName , "whyEnqueue" , whyEnqueue , "itemCount" , itemCount )
724+ ctl .Queue .Add (nodeItem {nodeName })
725+ }
726+
633727func (ctl * controller ) getNodeData (nodeName string ) * nodeData {
634728 ctl .mutex .Lock ()
635729 defer ctl .mutex .Unlock ()
@@ -673,8 +767,6 @@ func (ctl *controller) getServerData(nodeDat *nodeData, reqName string, reqUID a
673767}
674768
675769func (ctl * controller ) getLauncherData (nodeDat * nodeData , launcherPodName string ) * launcherData {
676- ctl .mutex .Lock ()
677- defer ctl .mutex .Unlock ()
678770 ans := nodeDat .Launchers [launcherPodName ]
679771 if ans == nil {
680772 ans = & launcherData {
@@ -685,6 +777,10 @@ func (ctl *controller) getLauncherData(nodeDat *nodeData, launcherPodName string
685777 return ans
686778}
687779
780+ func (ctl * controller ) clearLauncherData (nodeDat * nodeData , launcherPodName string ) {
781+ delete (nodeDat .Launchers , launcherPodName )
782+ }
783+
688784func (ctl * controller ) clearServerData (nodeDat * nodeData , uid apitypes.UID ) {
689785 ctl .mutex .Lock ()
690786 defer ctl .mutex .Unlock ()
0 commit comments