Skip to content

Commit 97174ff

Browse files
committed
Refine the deletion logic - only for unbound pods
1 parent c21c96a commit 97174ff

File tree

2 files changed

+51
-3
lines changed

2 files changed

+51
-3
lines changed

pkg/api/interface.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ const SleepingLabelName string = "dual-pods.llm-d.ai/sleeping"
116116

117117
const NominalHashAnnotationKey = "dual-pods.llm-d.ai/nominal"
118118

119+
// RequesterAnnotationKey is the annotation key read on a launcher pod to
// decide whether it is bound to a server-requesting pod. The
// launcher-populator expects the value to have the form "UID name".
const RequesterAnnotationKey = "dual-pods.llm-d.ai/requester"
120+
119121
// SleepState is what HTTP GET /is_sleeping on an inference server
120122
// returns (as JSON).
121123
type SleepState struct {

pkg/controller/launcher-populator/populator.go

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"encoding/json"
2424
"fmt"
2525
"os"
26+
"strings"
2627

2728
"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/api"
2829
dualpods "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/dual-pods"
@@ -377,17 +378,62 @@ func (ctl *controller) createLaunchers(ctx context.Context, node corev1.Node, ke
377378
// deleteExcessLaunchers deletes the specified number of launcher pods
378379
func (ctl *controller) deleteExcessLaunchers(ctx context.Context, launchers []corev1.Pod, count int) error {
379380
logger := klog.FromContext(ctx)
380-
// Delete the specified number of launcher pods (starting from the end)
381-
for i := 0; i < count && i < len(launchers); i++ {
382-
pod := launchers[len(launchers)-1-i]
381+
382+
// Filter out pods that are bound to server-requesting pods
383+
// Only delete unbound launcher pods
384+
var unboundLaunchers []corev1.Pod
385+
for _, pod := range launchers {
386+
if !ctl.isLauncherBoundToServerRequestingPod(pod) {
387+
unboundLaunchers = append(unboundLaunchers, pod)
388+
} else {
389+
logger.Info("Skipping deletion of launcher pod as it is bound to a server-requesting pod",
390+
"pod", pod.Name)
391+
}
392+
}
393+
394+
// Delete the specified number of unbound launcher pods (starting from the end)
395+
actualDeleteCount := len(unboundLaunchers)
396+
if count < actualDeleteCount {
397+
actualDeleteCount = count
398+
}
399+
for i := 0; i < actualDeleteCount && i < len(unboundLaunchers); i++ {
400+
pod := unboundLaunchers[len(unboundLaunchers)-1-i]
383401
if err := ctl.coreclient.Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
384402
return fmt.Errorf("failed to delete launcher pod %s: %w", pod.Name, err)
385403
}
386404
logger.Info("Deleted launcher pod", "pod", pod.Name)
387405
}
406+
407+
if actualDeleteCount < count {
408+
logger.Info("Fewer launcher pods were deleted than requested due to bound pods",
409+
"requested", count,
410+
"deleted", actualDeleteCount,
411+
"skipped", len(unboundLaunchers)-actualDeleteCount)
412+
}
413+
388414
return nil
389415
}
390416

417+
// isLauncherBoundToServerRequestingPod checks if the launcher pod is bound to any server-requesting pod
418+
func (ctl *controller) isLauncherBoundToServerRequestingPod(launcherPod corev1.Pod) bool {
419+
// Check if the launcher pod has annotations indicating assignment to a server-requesting pod
420+
requesterAnnotationValue, exists := launcherPod.Annotations[api.RequesterAnnotationKey]
421+
if !exists {
422+
return false
423+
}
424+
425+
// Verify the format of the annotation value: should be "UID name"
426+
parts := strings.Split(requesterAnnotationValue, " ")
427+
if len(parts) != 2 {
428+
return false // Invalid format
429+
}
430+
431+
// Optionally verify that the referenced pod actually exists
432+
// @TODO if need, we can append the check logic in further PR
433+
434+
return true
435+
}
436+
391437
// buildPodFromTemplate creates a pod from a template and assigns it to a node
392438
func (ctl *controller) buildPodFromTemplate(template corev1.PodTemplateSpec, key NodeLauncherKey) *corev1.Pod {
393439
pod := &corev1.Pod{

0 commit comments

Comments
 (0)