@@ -57,6 +57,12 @@ type nodeItem struct {
5757 NodeName string
5858}
5959
60+ type launcherSyncResult struct {
61+ instances * AllInstancesState
62+ deletedStoppedInstanceIDs sets.Set [string ]
63+ failedStoppedInstanceErrs map [string ]error
64+ }
65+
6066func (ni nodeItem ) process (ctx context.Context , ctl * controller ) (error , bool ) {
6167 logger := klog .FromContext (ctx ).WithValues ("node" , ni .NodeName )
6268 ctx = klog .NewContext (ctx , logger )
@@ -85,11 +91,11 @@ func (ni nodeItem) process(ctx context.Context, ctl *controller) (error, bool) {
8591 return nil , retries > 0
8692}
8793
88- func (item launcherPodItem ) process (ctx context.Context , ctl * controller , nodeDat * nodeData ) (error , bool ) {
94+ func (item unboundLauncherPodItem ) process (ctx context.Context , ctl * controller , nodeDat * nodeData ) (error , bool ) {
8995 logger := klog .FromContext (ctx ).WithValues ("launcherPod" , item .LauncherPodName , "node" , item .NodeName )
9096 ctx = klog .NewContext (ctx , logger )
9197
92- _ , err := ctl .podLister .Pods (ctl .namespace ).Get (item .LauncherPodName )
98+ launcherPod , err := ctl .podLister .Pods (ctl .namespace ).Get (item .LauncherPodName )
9399 if err != nil {
94100 if apierrors .IsNotFound (err ) {
95101 logger .V (2 ).Info ("Launcher pod deleted, cleaning up launcher data" )
@@ -100,8 +106,15 @@ func (item launcherPodItem) process(ctx context.Context, ctl *controller, nodeDa
100106 return err , true
101107 }
102108
109+ // Sync launcher instances to keep internal state fresh and clean up stopped instances.
110+ _ , syncErr , syncRetry := ctl .syncLauncherInstances (ctx , nodeDat , launcherPod )
111+
103112 ctl .enqueueUnboundInfSvrItemsOnNode (ctx , item .NodeName , fmt .Sprintf ("launcher pod %s changed" , item .LauncherPodName ))
104- return nil , false
113+
114+ if syncErr != nil {
115+ return fmt .Errorf ("failed to sync launcher instances: %w" , syncErr ), syncRetry
116+ }
117+ return nil , syncRetry
105118}
106119
107120func (item infSvrItem ) process (urCtx context.Context , ctl * controller , nodeDat * nodeData ) (error , bool ) {
@@ -350,6 +363,51 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
350363 return fmt .Errorf ("unable to wake up server because port not known: %w" , err ), true
351364 }
352365 }
366+ // For launcher-based providers, check whether the bound instance is still alive.
367+ // The sidecar notifier updates the Pod annotation when instance status changes,
368+ // which triggers reconciliation through the informer.
369+ if launcherBased && serverDat .InstanceID != "" && providingPod .Status .PodIP != "" {
370+ syncResult , err , retry := ctl .syncLauncherInstances (ctx , nodeDat , providingPod )
371+ if err != nil || retry {
372+ if err != nil {
373+ return fmt .Errorf ("failed to sync launcher instances for bound launcher Pod: %w" , err ), retry
374+ }
375+ return nil , true
376+ }
377+
378+ _ , instancePresent := findInstanceState (syncResult .instances .Instances , serverDat .InstanceID )
379+ if delErr , failedCleanup := syncResult .failedStoppedInstanceErrs [serverDat .InstanceID ]; failedCleanup {
380+ return fmt .Errorf ("failed to delete stopped instance %q from launcher: %w" , serverDat .InstanceID , delErr ), true
381+ }
382+ if _ , deletedStopped := syncResult .deletedStoppedInstanceIDs [serverDat .InstanceID ]; deletedStopped || ! instancePresent {
383+ if deletedStopped {
384+ logger .V (2 ).Info ("Deleted stopped bound instance from launcher during sync" )
385+ } else {
386+ logger .V (2 ).Info ("Bound instance not found in launcher after sync, treating as deleted" )
387+ }
388+ // Mark as sleeping so that ensureUnbound (called during requester deletion)
389+ // does not attempt to POST /sleep on the dead instance.
390+ // The instance process is dead — this is not a real sleeping state,
391+ // but it prevents ensureUnbound from hitting a dead endpoint and retrying forever.
392+ serverDat .Sleeping = ptr .To (true )
393+ // Delete the server-requesting Pod.
394+ // This is analogous to the direct-provider case where a providing Pod's
395+ // deletion is reflected to deletion of the server-requesting Pod.
396+ // The ReplicaSet will recreate the requesting Pod, triggering a fresh bind.
397+ err = podOps .Delete (ctx , requestingPod .Name , metav1.DeleteOptions {
398+ PropagationPolicy : ptr .To (metav1 .DeletePropagationBackground ),
399+ Preconditions : & metav1.Preconditions {UID : & item .UID , ResourceVersion : & requestingPod .ResourceVersion }})
400+ if err == nil {
401+ logger .V (2 ).Info ("Requested deletion of server-requesting Pod because bound instance stopped" )
402+ } else if apierrors .IsGone (err ) || apierrors .IsNotFound (err ) {
403+ logger .V (5 ).Info ("The server-requesting Pod is already gone" )
404+ } else {
405+ return fmt .Errorf ("failed to delete server-requesting Pod for stopped instance: %w" , err ), true
406+ }
407+ serverDat .RequesterDeleteRequested = true
408+ return nil , false
409+ }
410+ }
353411 if serverDat .Sleeping == nil {
354412 sleeping , err := ctl .querySleeping (ctx , providingPod , serverPort )
355413 if err != nil {
@@ -469,7 +527,6 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
469527 lcName := isc .Spec .LauncherConfigName
470528 lc , err := ctl .lcLister .LauncherConfigs (ctl .namespace ).Get (lcName )
471529 if err != nil {
472- // TODO(waltforme): introduce the 'enqueue requesters by launcherconfigs' logic to the controller
473530 return ctl .ensureReqStatus (ctx , requestingPod , serverDat ,
474531 fmt .Sprintf ("failed to get LauncherConfig %q: %v" , lcName , err ),
475532 )
@@ -561,7 +618,6 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
561618 }
562619 // Sleeper budget is met. Make a new launcher Pod.
563620
564- // TODO(waltforme): introduce the 'enqueue requesters by launcher pods' logic to the controller.
565621 echo , err := podOps .Create (ctx , desiredLauncherPod , metav1.CreateOptions {})
566622 if err != nil {
567623 errMsg := err .Error ()
@@ -621,11 +677,12 @@ func (ctl *controller) selectBestLauncherPod(
621677 continue
622678 }
623679
624- insts , err , retry := ctl .syncLauncherInstances (ctx , nodeDat , launcherPod )
680+ syncResult , err , retry := ctl .syncLauncherInstances (ctx , nodeDat , launcherPod )
625681 if err != nil || retry {
626682 somePodsNotReady = true
627683 continue
628684 }
685+ insts := syncResult .instances
629686
630687 // Check if this launcher has a sleeping instance matching the iscHash
631688 hasSleepingInstance := false
@@ -650,7 +707,7 @@ func (ctl *controller) selectBestLauncherPod(
650707 hasPortConflict = true
651708 break
652709 }
653- if inst .InstanceID == iscHash {
710+ if inst .InstanceID == iscHash && inst . Status != InstanceStatusStopped {
654711 hasSleepingInstance = true
655712 }
656713 }
@@ -1324,10 +1381,19 @@ var coreScheme *k8sruntime.Scheme
13241381var codecFactory k8sserializer.CodecFactory
13251382var podDecoder k8sruntime.Decoder
13261383
1384+ func findInstanceState (insts []InstanceState , instanceID string ) (* InstanceState , bool ) {
1385+ for idx := range insts {
1386+ if insts [idx ].InstanceID == instanceID {
1387+ return & insts [idx ], true
1388+ }
1389+ }
1390+ return nil , false
1391+ }
1392+
13271393// syncLauncherInstances queries the launcher pod for its current instances,
13281394// deletes stopped instances from the launcher, updates the controller's internal
13291395// launcherData state, and returns the filtered instance list plus per-instance cleanup outcomes.
1330- func (ctl * controller ) syncLauncherInstances (ctx context.Context , nodeDat * nodeData , launcherPod * corev1.Pod ) (* AllInstancesState , error , bool ) {
1396+ func (ctl * controller ) syncLauncherInstances (ctx context.Context , nodeDat * nodeData , launcherPod * corev1.Pod ) (* launcherSyncResult , error , bool ) {
13311397 logger := klog .FromContext (ctx )
13321398
13331399 if launcherPod .Status .PodIP == "" || ! utils .IsPodReady (launcherPod ) {
@@ -1350,14 +1416,43 @@ func (ctl *controller) syncLauncherInstances(ctx context.Context, nodeDat *nodeD
13501416
13511417 launcherDat := ctl .getLauncherData (nodeDat , launcherPod .Name )
13521418 newInstances := make (map [string ]time.Time )
1419+ remainingInstances := make ([]InstanceState , 0 , len (insts .Instances ))
1420+ deletedStoppedInstanceIDs := sets .New [string ]()
1421+ failedStoppedInstanceErrs := map [string ]error {}
1422+ runningCount := 0
13531423 for _ , inst := range insts .Instances {
1424+ if inst .Status == InstanceStatusStopped {
1425+ // Clean up stopped instances from the launcher.
1426+ _ , delErr := lClient .DeleteInstance (ctx , inst .InstanceID )
1427+ if delErr != nil && ! IsInstanceNotFoundError (delErr ) {
1428+ logger .V (3 ).Info ("Failed to delete stopped instance from launcher during sync" ,
1429+ "instanceID" , inst .InstanceID , "err" , delErr )
1430+ // Deletion failed — the instance still occupies a slot in the launcher.
1431+ failedStoppedInstanceErrs [inst .InstanceID ] = delErr
1432+ } else {
1433+ logger .V (2 ).Info ("Deleted stopped instance from launcher during sync" ,
1434+ "instanceID" , inst .InstanceID )
1435+ deletedStoppedInstanceIDs .Insert (inst .InstanceID )
1436+ continue
1437+ }
1438+ }
1439+ remainingInstances = append (remainingInstances , inst )
1440+ if inst .Status == "running" {
1441+ runningCount ++
1442+ }
13541443 if lastUsed , exists := launcherDat .Instances [inst .InstanceID ]; exists {
13551444 newInstances [inst .InstanceID ] = lastUsed
13561445 } else {
13571446 newInstances [inst .InstanceID ] = time .Now ()
13581447 }
13591448 }
13601449
1450+ // Replace the returned instance list and counts with the filtered view
1451+ // so that callers (e.g. selectBestLauncherPod) see accurate capacity.
1452+ insts .Instances = remainingInstances
1453+ insts .TotalInstances = len (remainingInstances )
1454+ insts .RunningInstances = runningCount
1455+
13611456 launcherDat .Instances = newInstances
13621457 launcherDat .Accurate = true
13631458
@@ -1367,7 +1462,11 @@ func (ctl *controller) syncLauncherInstances(ctx context.Context, nodeDat *nodeD
13671462 "runningInstances" , insts .RunningInstances ,
13681463 "instanceCount" , len (newInstances ))
13691464
1370- return insts , nil , false
1465+ return & launcherSyncResult {
1466+ instances : insts ,
1467+ deletedStoppedInstanceIDs : deletedStoppedInstanceIDs ,
1468+ failedStoppedInstanceErrs : failedStoppedInstanceErrs ,
1469+ }, nil , false
13711470}
13721471
13731472func init () {
0 commit comments