Skip to content

Commit f81f66e

Browse files
committed
Begin to manage the lifecycles of vLLM instance via launchers
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
1 parent 9f40347 commit f81f66e

File tree

3 files changed

+202
-6
lines changed

3 files changed

+202
-6
lines changed

pkg/api/interface.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@ const ServerPatchAnnotationName = "dual-pods.llm-d.ai/server-patch"
5252
// This annotation is mutually exclusive with the 'ServerPatchAnnotationName' annotation.
5353
const InferenceServerConfigAnnotationName = "dual-pods.llm-d.ai/inference-server-config"
5454

55+
// LauncherConfigHashAnnotationName is the name of an annotation on the
// launcher-based server-providing Pod. The value of the annotation is the hash of the
// LauncherConfig object's PodTemplate that the server-providing Pod uses.
// The dual-pods controller indexes server-providing Pods by this value to find
// the launcher Pod serving a given LauncherConfig.
const LauncherConfigHashAnnotationName = "dual-pods.llm-d.ai/launcher-config-hash"

// LauncherServicePort is the port number on which the launcher exposes its HTTP service
// for the management of vLLM instances.
// This is a contract between the dual-pods controller and the launcher implementation;
// changing it here without changing the launcher image breaks instance management.
const LauncherServicePort = 8001
64+
5565
// StatusAnnotationName is the name of an annotation that the dual-pods controller
5666
// maintains reporting the ServerRequestingPodStatus. The value of this annotation is the
5767
// JSON rendering of the status.

pkg/controller/dual-pods/controller.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ func (config ControllerConfig) NewController(
175175
ctl.gpuMap.Store(&map[string]GpuLocation{})
176176
err := ctl.podInformer.AddIndexers(cache.Indexers{
177177
inferenceServerConfigIndexName: inferenceServerConfigIndexFunc,
178+
launcherConfigHashIndexName: launcherConfigHashIndexFunc,
178179
requesterIndexName: requesterIndexFunc,
179180
nominalHashIndexName: nominalHashIndexFunc,
180181
GPUIndexName: GPUIndexFunc})
@@ -268,9 +269,10 @@ type nodeData struct {
268269
}
269270

270271
// itemOnNode is a unit of per-node reconciliation work. Implementations carry
// their own identity; the controller picks which processing path to invoke.
type itemOnNode interface {
	// process and processLauncherBased return (err error, retry bool).
	// There will be a retry iff `retry`.
	// process is the direct Pod-manipulation path; processLauncherBased drives a
	// launcher's HTTP API instead.
	process(ctx context.Context, ctl *controller, nodeDat *nodeData) (error, bool)
	processLauncherBased(ctx context.Context, ctl *controller, nodeDat *nodeData) (error, bool)
}
275277

276278
// Internal state about an inference server
@@ -305,15 +307,19 @@ type serverData struct {
305307

306308
// nolint
// launcherData is the controller's in-memory view of one launcher's vLLM instances.
type launcherData struct {
	// Instances is a map, where a key is an instance's nominal hash
	// (the hash of the InferenceServerConfig's ModelServerConfig).
	Instances map[string]*InstanceData

	// Accurate indicates whether the set of nominal hash in Instances is accurate
	// (i.e., whether it reflects the launcher's actual state).
	Accurate bool
}
316316

317+
// InstanceData records what the controller knows about one vLLM instance
// managed by a launcher.
type InstanceData struct {
	// ID is the instance's UUID as assigned by the launcher.
	ID string
	// LastUsed is the last time the controller observed a requester using
	// this instance; it is refreshed on each reconciliation that matches it.
	LastUsed time.Time
}
322+
317323
type queueItem interface {
318324
// process returns (err error, retry bool).
319325
// There will be a retry iff `retry`, error logged if `err != nil`.
@@ -371,6 +377,17 @@ func inferenceServerConfigIndexFunc(obj any) ([]string, error) {
371377
return []string{inferenceServerConfigName}, nil
372378
}
373379

380+
const launcherConfigHashIndexName = "launcherconfighash"
381+
382+
func launcherConfigHashIndexFunc(obj any) ([]string, error) {
383+
pod := obj.(*corev1.Pod)
384+
launcherConfigHash := pod.Annotations[api.LauncherConfigHashAnnotationName]
385+
if len(launcherConfigHash) == 0 {
386+
return []string{}, nil
387+
}
388+
return []string{launcherConfigHash}, nil
389+
}
390+
374391
const requesterIndexName = "requester"
375392

376393
func requesterIndexFunc(obj any) ([]string, error) {

pkg/controller/dual-pods/inference-server.go

Lines changed: 170 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ import (
4747
"k8s.io/utils/ptr"
4848
"sigs.k8s.io/yaml"
4949

50+
fmav1alpha1 "github.com/llm-d-incubation/llm-d-fast-model-actuation/api/fma/v1alpha1"
5051
"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/api"
5152
stubapi "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/spi"
5253
)
@@ -64,7 +65,14 @@ func (ni nodeItem) process(ctx context.Context, ctl *controller) (error, bool) {
6465
logger.V(4).Info("Processing items for node", "count", len(items))
6566
for localItem := range items {
6667
logger.V(4).Info("Processing node-local item", "item", localItem)
67-
err, retry := localItem.process(ctx, ctl, nodeDat)
68+
launcherbased := true // TODO(waltforme): externalize this switch
69+
var err error
70+
var retry bool
71+
if launcherbased {
72+
err, retry = localItem.processLauncherBased(ctx, ctl, nodeDat)
73+
} else {
74+
err, retry = localItem.process(ctx, ctl, nodeDat)
75+
}
6876
if err != nil {
6977
if retry {
7078
logger.Info("Processing node local item suffered transient error, will retry", "item", localItem, "err", err)
@@ -401,6 +409,167 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
401409
return ctl.ensureReqStatus(ctx, requestingPod, serverDat)
402410
}
403411

412+
func (item infSvrItem) processLauncherBased(urCtx context.Context, ctl *controller, nodeDat *nodeData) (error, bool) {
413+
logger := klog.FromContext(urCtx).WithValues("serverUID", item.UID, "requesterName", item.RequesterName)
414+
ctx := klog.NewContext(urCtx, logger)
415+
416+
requestingPod, err := ctl.podLister.Pods(ctl.namespace).Get(item.RequesterName)
417+
if err != nil {
418+
if apierrors.IsNotFound(err) {
419+
requestingPod = nil
420+
} else {
421+
logger.Error(err, "Failed to get Pod")
422+
return err, true
423+
}
424+
} else {
425+
logger = logger.WithValues("requesterRV", requestingPod.ResourceVersion)
426+
}
427+
428+
// from the requestingPod's annotations, get the InferenceServerConfig object
429+
iscName, have := requestingPod.Annotations[api.InferenceServerConfigAnnotationName]
430+
if !have {
431+
// TODO(waltforme): report error in the status annotation
432+
// It is safe not to retry here because once the user update the annotation of requestingPod, another processing is triggered
433+
return fmt.Errorf("requesting Pod %q is missing annotation %q", requestingPod.Name, api.InferenceServerConfigAnnotationName), false
434+
}
435+
isc, err := ctl.iscLister.InferenceServerConfigs(ctl.namespace).Get(iscName)
436+
if err != nil {
437+
// TODO(waltforme): report error in the status annotation
438+
// It is safe not to retry here because once an event from InferenceServerConfig occurs, another processing is triggered
439+
return fmt.Errorf("failed to get InferenceServerConfig %q: %w", iscName, err), false
440+
}
441+
442+
// from the InferenceServerConfig object, get the launcherConfig object
443+
lcName := isc.Spec.LauncherConfigName
444+
lc, err := ctl.lcLister.LauncherConfigs(ctl.namespace).Get(lcName)
445+
if err != nil {
446+
// TODO(waltforme): report error in the status annotation
447+
// TODO(waltforme): introduce the 'enqueue requesters by launcherconfigs' logic to the controller
448+
// It is safe not to retry here because once an event from LauncherConfig occurs, another processing is triggered
449+
return fmt.Errorf("failed to get LauncherConfig %q: %w", lcName, err), false
450+
}
451+
452+
// find which launcher Pod is using this launcherConfig, then find its IP
453+
lcTemplateHash, err := ctl.parseLauncherConfig(lc)
454+
if err != nil {
455+
return fmt.Errorf("parse LauncherConfig %q: %w", lcName, err), true
456+
}
457+
logger.V(5).Info("LauncherConfig's PodTemplate hash", "hash", lcTemplateHash)
458+
launcherPodAnys, err := ctl.podInformer.GetIndexer().ByIndex(launcherConfigHashIndexName, lcTemplateHash)
459+
if err != nil {
460+
return err, false
461+
}
462+
if len(launcherPodAnys) == 0 {
463+
// TODO(waltforme): report error in the status annotation
464+
// TODO(waltforme): introduce the 'enqueue requesters by launcher Pod' logic to the controller
465+
// It will be safe not to retry here because once the launcher Pod exists, another processing is triggered
466+
return fmt.Errorf("no launcher Pod found for LauncherConfig %q with PodTemplate hash %q", lcName, lcTemplateHash), false
467+
}
468+
// Should multiple launcher Pods exist for the same LauncherConfig on one node? The answer is no.
469+
// TODO(waltforme): Should we report error if multiple launcher Pods are found? Should we delete the extra ones?
470+
launcherPod := launcherPodAnys[0].(*corev1.Pod)
471+
logger.V(5).Info("Found launcher Pod", "name", launcherPod.Name)
472+
launcherIP := launcherPod.Status.PodIP
473+
if launcherIP == "" {
474+
return fmt.Errorf("launcher Pod %q has no IP assigned yet", launcherPod.Name), true
475+
}
476+
477+
// Create launcher client
478+
launcherBaseURL := fmt.Sprintf("http://%s:%d", launcherIP, api.LauncherServicePort)
479+
lClient, err := NewLauncherClient(launcherBaseURL)
480+
if err != nil {
481+
return err, true
482+
}
483+
484+
// List vLLM instances
485+
statuses, err := lClient.ListInstances(ctx)
486+
if err != nil {
487+
return err, true
488+
}
489+
logger.V(5).Info("vLLM instance counts",
490+
"total_instances", statuses.TotalInstances,
491+
"running_instances", statuses.RunningInstances,
492+
)
493+
494+
// TODO(waltforme): implement the following logic:
495+
// - if no instance is present for the request, create one
496+
// - if an existing instance is fulfilling the request, noop
497+
// - if some instances are fulfilling an obsolete request, delete them
498+
499+
// First, ensure a vLLM instance exists for the inferenceserverconfig object.
500+
cfg, iscHash, err := ctl.parseInferenceServerConfig(isc)
501+
if err != nil {
502+
return fmt.Errorf("parse inference server config: %w", err), true
503+
}
504+
logger.V(5).Info("Nominal hash of InferenceServerConfig", "hash", iscHash)
505+
InstExists := false
506+
if nodeDat.Launchers == nil {
507+
nodeDat.Launchers = make(map[string]*launcherData)
508+
}
509+
if _, have := nodeDat.Launchers[lcName]; !have {
510+
nodeDat.Launchers[lcName] = &launcherData{
511+
Instances: make(map[string]*InstanceData),
512+
Accurate: true,
513+
}
514+
}
515+
launcherDat := nodeDat.Launchers[lcName]
516+
for hash, inst := range launcherDat.Instances {
517+
if hash == iscHash {
518+
InstExists = true
519+
inst.LastUsed = time.Now()
520+
break
521+
}
522+
}
523+
if !InstExists {
524+
result, err := lClient.CreateInstance(ctx, *cfg)
525+
if err != nil {
526+
return fmt.Errorf("create vLLM instance: %w", err), true
527+
}
528+
logger.V(5).Info("Created new vLLM instance",
529+
"instance_id", result.InstanceID,
530+
"status", result.Status,
531+
)
532+
launcherDat.Instances[iscHash] = &InstanceData{ID: result.InstanceID, LastUsed: time.Now()}
533+
nodeDat.Launchers[lcName] = launcherDat
534+
}
535+
536+
return nil, false
537+
}
538+
539+
func (ctl *controller) parseInferenceServerConfig(isc *fmav1alpha1.InferenceServerConfig) (*VllmConfig, string, error) {
540+
vllmCfg := VllmConfig{
541+
Options: isc.Spec.ModelServerConfig.Options,
542+
EnvVars: make(map[string]interface{}, len(isc.Spec.ModelServerConfig.EnvVars)),
543+
}
544+
for k, v := range isc.Spec.ModelServerConfig.EnvVars {
545+
vllmCfg.EnvVars[k] = v
546+
}
547+
548+
iscBytes, err := yaml.Marshal(isc.Spec.ModelServerConfig)
549+
if err != nil {
550+
return nil, "", fmt.Errorf("failed to marshal InferenceServerConfig %q: %w", isc.Name, err)
551+
}
552+
hasher := sha256.New()
553+
hasher.Write(iscBytes)
554+
var hash [sha256.Size]byte
555+
hashSl := hasher.Sum(hash[:0])
556+
nominalHash := base64.RawStdEncoding.EncodeToString(hashSl)
557+
558+
return &vllmCfg, nominalHash, nil
559+
}
560+
561+
func (ctl *controller) parseLauncherConfig(lc *fmav1alpha1.LauncherConfig) (string, error) {
562+
podTemplateBytes, err := yaml.Marshal(lc.Spec.PodTemplate)
563+
if err != nil {
564+
return "", fmt.Errorf("failed to marshal LauncherConfig %q: %w", lc.Name, err)
565+
}
566+
hasher := sha256.New()
567+
hasher.Write(podTemplateBytes)
568+
var hash [sha256.Size]byte
569+
hashSl := hasher.Sum(hash[:0])
570+
return base64.RawStdEncoding.EncodeToString(hashSl), nil
571+
}
572+
404573
func (ctl *controller) ensureSleepingLabel(ctx context.Context, providingPod *corev1.Pod, desired bool) error {
405574
logger := klog.FromContext(ctx)
406575
desiredStr := strconv.FormatBool(desired)

0 commit comments

Comments
 (0)