@@ -266,7 +266,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
266266 if providingPod .Labels != nil {
267267 _ , providingPodLauncherBased = providingPod .Labels [ctlrcommon .LauncherConfigNameLabelKey ]
268268 }
269- err := ctl .ensureUnbound (ctx , serverDat , providingPod , providingPodLauncherBased )
269+ err := ctl .ensureUnbound (ctx , serverDat , nodeDat , providingPod , providingPodLauncherBased )
270270 if err != nil {
271271 return err , true
272272 }
@@ -334,7 +334,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
334334 if serverDat .GPUIDsStr == nil {
335335 logger .V (5 ).Info ("Querying accelerators" , "ip" , requesterIP , "port" , adminPort )
336336 url := fmt .Sprintf ("http://%s:%s%s" , requesterIP , adminPort , stubapi .AcceleratorQueryPath )
337- gpuUUIDs , err := getGPUUUIDs (url )
337+ gpuUUIDs , err := getGPUUUIDs (ctx , url )
338338 if err != nil {
339339 queryErr := fmt .Errorf ("GET %q fails: %s" , url , err .Error ())
340340 updateErr , _ := ctl .ensureReqStatus (ctx , requestingPod , serverDat , queryErr .Error ())
@@ -1118,36 +1118,115 @@ func (ctl *controller) removeProviderFinalizer(ctx context.Context, providingPod
11181118 return false , nil // no change
11191119}
11201120
1121+ func (item instanceGCItem ) process (ctx context.Context , ctl * controller , nodeDat * nodeData ) (error , bool ) {
1122+ logger := klog .FromContext (ctx ).WithValues ("iscName" , item .ISCName )
1123+
1124+ isc , err := ctl .iscLister .InferenceServerConfigs (ctl .namespace ).Get (item .ISCName )
1125+ if err != nil {
1126+ if apierrors .IsNotFound (err ) {
1127+ return nil , false
1128+ }
1129+ return err , true
1130+ }
1131+
1132+ for launcherPodName , launcherDat := range nodeDat .Launchers {
1133+ launcherPod , err := ctl .podLister .Pods (ctl .namespace ).Get (launcherPodName )
1134+ if err != nil {
1135+ if apierrors .IsNotFound (err ) {
1136+ continue
1137+ }
1138+ logger .Error (err , "Failed to get launcher pod during instance GC" , "launcherPod" , launcherPodName )
1139+ continue
1140+ }
1141+ if launcherPod .DeletionTimestamp != nil || launcherPod .Status .PodIP == "" {
1142+ continue
1143+ }
1144+ launcherBaseURL := fmt .Sprintf ("http://%s:%d" , launcherPod .Status .PodIP , ctlrcommon .LauncherServicePort )
1145+ lClient , err := NewLauncherClient (launcherBaseURL )
1146+ if err != nil {
1147+ logger .Error (err , "Failed to create launcher client during instance GC" , "launcherPod" , launcherPodName )
1148+ continue
1149+ }
1150+ allInsts , err := lClient .ListInstances (ctx )
1151+ if err != nil {
1152+ logger .Error (err , "Failed to list instances during instance GC" , "launcherPod" , launcherPodName )
1153+ continue
1154+ }
1155+ for _ , inst := range allInsts .Instances {
1156+ if inst .Annotations [VllmConfigISCNameAnnotationKey ] != isc .Name {
1157+ continue
1158+ }
1159+ if len (inst .GpuUUIDs ) == 0 {
1160+ logger .V (4 ).Info ("Skipping instance GC: no GPU UUIDs" , "launcherPod" , launcherPodName , "instanceID" , inst .InstanceID )
1161+ continue
1162+ }
1163+ _ , currentHash , err := ctl .configInferenceServer (isc , inst .GpuUUIDs )
1164+ if err != nil {
1165+ logger .Error (err , "Failed to compute current hash during instance GC" , "launcherPod" , launcherPodName , "instanceID" , inst .InstanceID )
1166+ continue
1167+ }
1168+ if inst .InstanceID == currentHash {
1169+ continue // not obsolete
1170+ }
1171+ sleeping , err := ctl .querySleeping (ctx , launcherPod , int16 (isc .Spec .ModelServerConfig .Port ))
1172+ if err != nil {
1173+ logger .Error (err , "Failed to query sleeping state during instance GC" , "launcherPod" , launcherPodName , "instanceID" , inst .InstanceID )
1174+ continue
1175+ }
1176+ if ! sleeping {
1177+ logger .V (4 ).Info ("Skipping instance GC: instance not explicitly sleeping" , "launcherPod" , launcherPodName , "instanceID" , inst .InstanceID )
1178+ continue
1179+ }
1180+ if _ , err := lClient .DeleteInstance (ctx , inst .InstanceID ); err != nil {
1181+ if ! IsInstanceNotFoundError (err ) {
1182+ logger .Error (err , "Failed to delete obsolete sleeping instance during GC" , "launcherPod" , launcherPodName , "instanceID" , inst .InstanceID )
1183+ }
1184+ continue
1185+ }
1186+ delete (launcherDat .Instances , inst .InstanceID )
1187+ logger .V (2 ).Info ("Deleted obsolete sleeping instance" , "launcherPod" , launcherPodName , "instanceID" , inst .InstanceID , "currentHash" , currentHash )
1188+ }
1189+ }
1190+ return nil , false
1191+ }
1192+
11211193// Unbinds the given server-providing Pod.
1122- func (ctl * controller ) ensureUnbound (ctx context.Context , serverDat * serverData , providingPod * corev1.Pod , launcherBased bool ) error {
1194+ func (ctl * controller ) ensureUnbound (ctx context.Context , serverDat * serverData , nodeDat * nodeData , providingPod * corev1.Pod , launcherBased bool ) error {
11231195 logger := klog .FromContext (ctx )
11241196 // A providingPod with no IP is not scheduled, so we know that it is not awake.
11251197 // If providingPod is stale then the update will fail.
11261198 if (serverDat .Sleeping == nil || ! * (serverDat .Sleeping )) && providingPod .Status .PodIP != "" { // need to put to sleep
1127- serverPort := serverDat .ServerPort
1128- // TODO(waltforme): Is serverPort always set correctly for launcher-based server-providing Pods upon unbinding?
1129- // E.g. What if requestingPod is deleted during a crash and restart of the dual-pods controller?
1130- // In order to find the port in this case, I think the best effort is to recompute hash for all InferenceServerConfig objects and try to match.
1131- if ! launcherBased {
1132- if serverDat .NominalProvidingPod == nil {
1133- var err error
1134- _ , serverPort , err = utils .GetInferenceServerContainerIndexAndPort (providingPod )
1135- if err != nil { // Impossible, because such a providingPod would never be created by this controller
1136- return fmt .Errorf ("unable to put server to sleep because port not known: %w" , err )
1199+ // For launcher-based instances, check if the instance is already obsolete
1200+ // (i.e. its InferenceServerConfig was updated since the instance was created).
1201+ // If so, delete it from the launcher rather than putting it to sleep.
1202+ if launcherBased && ctl .maybeDeleteObsoleteInstance (ctx , serverDat , nodeDat , providingPod ) {
1203+ serverDat .Sleeping = ptr .To (true )
1204+ } else {
1205+ serverPort := serverDat .ServerPort
1206+ // TODO(waltforme): Is serverPort always set correctly for launcher-based server-providing Pods upon unbinding?
1207+ // E.g. What if requestingPod is deleted during a crash and restart of the dual-pods controller?
1208+ // In order to find the port in this case, I think the best effort is to recompute hash for all InferenceServerConfig objects and try to match.
1209+ if ! launcherBased {
1210+ if serverDat .NominalProvidingPod == nil {
1211+ var err error
1212+ _ , serverPort , err = utils .GetInferenceServerContainerIndexAndPort (providingPod )
1213+ if err != nil { // Impossible, because such a providingPod would never be created by this controller
1214+ return fmt .Errorf ("unable to put server to sleep because port not known: %w" , err )
1215+ }
11371216 }
11381217 }
1218+ endpoint := fmt .Sprintf ("%s:%d" , providingPod .Status .PodIP , serverPort )
1219+ sleepURL := "http://" + endpoint + "/sleep"
1220+ resp , err := http .Post (sleepURL , "" , nil )
1221+ if err != nil {
1222+ return fmt .Errorf ("failed to put provider %q to sleep, POST %s got error: %w" , serverDat .ProvidingPodName , sleepURL , err )
1223+ }
1224+ if sc := resp .StatusCode ; sc != http .StatusOK {
1225+ return fmt .Errorf ("failed to put provider %q to sleep, POST %s returned status %d" , serverDat .ProvidingPodName , sleepURL , sc )
1226+ }
1227+ serverDat .Sleeping = ptr .To (true )
1228+ logger .V (2 ).Info ("Put inference server to sleep" , "endpoint" , endpoint )
11391229 }
1140- endpoint := fmt .Sprintf ("%s:%d" , providingPod .Status .PodIP , serverPort )
1141- sleepURL := "http://" + endpoint + "/sleep"
1142- resp , err := http .Post (sleepURL , "" , nil )
1143- if err != nil {
1144- return fmt .Errorf ("failed to put provider %q to sleep, POST %s got error: %w" , serverDat .ProvidingPodName , sleepURL , err )
1145- }
1146- if sc := resp .StatusCode ; sc != http .StatusOK {
1147- return fmt .Errorf ("failed to put provider %q to sleep, POST %s returned status %d" , serverDat .ProvidingPodName , sleepURL , sc )
1148- }
1149- serverDat .Sleeping = ptr .To (true )
1150- logger .V (2 ).Info ("Put inference server to sleep" , "endpoint" , endpoint )
11511230 }
11521231 providingPod = providingPod .DeepCopy ()
11531232 var aChange , fChange bool
@@ -1227,6 +1306,64 @@ func (ctl *controller) ensureUnbound(ctx context.Context, serverDat *serverData,
12271306 return nil
12281307}
12291308
1309+ // maybeDeleteObsoleteInstance checks whether the launcher-based instance is obsolete
1310+ // (its InferenceServerConfig was updated since the instance was created) and if so,
1311+ // deletes it from the launcher. Returns true if the instance was deleted.
1312+ // On any error, returns false so the caller falls through to the normal sleep path.
1313+ func (ctl * controller ) maybeDeleteObsoleteInstance (ctx context.Context , serverDat * serverData , nodeDat * nodeData , providingPod * corev1.Pod ) bool {
1314+ logger := klog .FromContext (ctx )
1315+ if serverDat .InstanceID == "" {
1316+ return false
1317+ }
1318+ launcherBaseURL := fmt .Sprintf ("http://%s:%d" , providingPod .Status .PodIP , ctlrcommon .LauncherServicePort )
1319+ lClient , err := NewLauncherClient (launcherBaseURL )
1320+ if err != nil {
1321+ logger .V (4 ).Info ("Cannot check instance obsolescence: failed to create launcher client" , "err" , err )
1322+ return false
1323+ }
1324+ instState , err := lClient .GetInstanceState (ctx , serverDat .InstanceID )
1325+ if err != nil {
1326+ logger .V (4 ).Info ("Cannot check instance obsolescence: failed to get instance state" , "instanceID" , serverDat .InstanceID , "err" , err )
1327+ return false
1328+ }
1329+ iscName := instState .Annotations [VllmConfigISCNameAnnotationKey ]
1330+ if iscName == "" {
1331+ logger .V (4 ).Info ("Cannot check instance obsolescence: no ISC name annotation on instance" , "instanceID" , serverDat .InstanceID )
1332+ return false
1333+ }
1334+ currentISC , err := ctl .iscLister .InferenceServerConfigs (ctl .namespace ).Get (iscName )
1335+ if err != nil {
1336+ logger .V (4 ).Info ("Cannot check instance obsolescence: ISC not found" , "iscName" , iscName , "err" , err )
1337+ return false
1338+ }
1339+ if len (instState .GpuUUIDs ) == 0 {
1340+ logger .V (4 ).Info ("Cannot check instance obsolescence: no GPU UUIDs on instance" , "instanceID" , serverDat .InstanceID )
1341+ return false
1342+ }
1343+ _ , currentHash , err := ctl .configInferenceServer (currentISC , instState .GpuUUIDs )
1344+ if err != nil {
1345+ logger .V (4 ).Info ("Cannot check instance obsolescence: failed to compute current hash" , "iscName" , iscName , "err" , err )
1346+ return false
1347+ }
1348+ if currentHash == serverDat .InstanceID {
1349+ return false // not obsolete
1350+ }
1351+ // Instance is obsolete — delete from launcher instead of sleeping.
1352+ if _ , err := lClient .DeleteInstance (ctx , serverDat .InstanceID ); err != nil {
1353+ if ! IsInstanceNotFoundError (err ) {
1354+ logger .Error (err , "Failed to delete obsolete instance during unbinding" ,
1355+ "instanceID" , serverDat .InstanceID )
1356+ return false
1357+ }
1358+ }
1359+ if launcherDat := nodeDat .Launchers [providingPod .Name ]; launcherDat != nil {
1360+ delete (launcherDat .Instances , serverDat .InstanceID )
1361+ }
1362+ logger .V (2 ).Info ("Deleted obsolete instance during unbinding" ,
1363+ "instanceID" , serverDat .InstanceID , "currentHash" , currentHash , "iscName" , iscName )
1364+ return true
1365+ }
1366+
12301367// getNominalServerProvidingPod returns the nominal server-providing Pod,
12311368// which is cached in the serverData, computing the Pod if necessary.
12321369// This also ensures that the serverData fields NominalProvidingPod and NominalProvidingPodHash
@@ -1375,7 +1512,7 @@ func getReducedInferenceContainerState(from *corev1.Pod) *reducedContainerState
13751512
13761513func (ctl * controller ) querySleeping (ctx context.Context , providingPod * corev1.Pod , serverPort int16 ) (bool , error ) {
13771514 queryURL := fmt .Sprintf ("http://%s:%d/is_sleeping" , providingPod .Status .PodIP , serverPort )
1378- body , err := doGet (queryURL )
1515+ body , err := doGet (ctx , queryURL )
13791516 if err != nil {
13801517 return false , err
13811518 }
@@ -1393,7 +1530,7 @@ func (ctl *controller) accelMemoryIsLowEnough(ctx context.Context, requestingPod
13931530 adminPort = api .AdminPortDefaultValue
13941531 }
13951532 url := fmt .Sprintf ("http://%s:%s%s" , requestingPod .Status .PodIP , adminPort , stubapi .AcceleratorMemoryQueryPath )
1396- body , err := doGet (url )
1533+ body , err := doGet (ctx , url )
13971534 if err != nil {
13981535 return err
13991536 }
@@ -1598,12 +1735,16 @@ func init() {
15981735 podDecoder = codecFactory .UniversalDecoder (corev1 .SchemeGroupVersion )
15991736}
16001737
1601- func doGet (url string ) ([]byte , error ) {
1738+ func doGet (ctx context. Context , url string ) ([]byte , error ) {
16021739 client := & http.Client {
16031740 Timeout : 5 * time .Second ,
16041741 }
16051742
1606- resp , err := client .Get (url )
1743+ req , err := http .NewRequestWithContext (ctx , http .MethodGet , url , nil )
1744+ if err != nil {
1745+ return nil , fmt .Errorf ("http get %q: %w" , url , err )
1746+ }
1747+ resp , err := client .Do (req )
16071748 if err != nil {
16081749 return nil , fmt .Errorf ("http get %q: %w" , url , err )
16091750 }
@@ -1621,8 +1762,8 @@ func doGet(url string) ([]byte, error) {
16211762}
16221763
16231764// getGPUUUIDs does the HTTP GET on the given URL to fetch the assigned GPU UUIDs.
1624- func getGPUUUIDs (url string ) ([]string , error ) {
1625- body , err := doGet (url )
1765+ func getGPUUUIDs (ctx context. Context , url string ) ([]string , error ) {
1766+ body , err := doGet (ctx , url )
16261767 if err != nil {
16271768 return nil , err
16281769 }
0 commit comments