Test cases for multiple instances sharing one launcher

waltforme · waltforme · commit ccb7a55cf9c8 · 2026-03-02T10:07:37.000Z
diff --git a/pkg/controller/dual-pods/inference-server.go b/pkg/controller/dual-pods/inference-server.go
@@ -472,7 +472,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
 		// then those with capacity for new instances.
 		// Note that multiple vLLM instances could exist in one launcher Pod, but at most one instance could be awake at a time.
 
-		launcherPod, hasSleepingInstance, someNotReady, err := ctl.selectBestLauncherPod(ctx, launcherPodAnys, iscHash, nodeDat)
+		launcherPod, hasSleepingInstance, someNotReady, err := ctl.selectBestLauncherPod(ctx, launcherPodAnys, iscHash, int(lc.Spec.MaxSleepingInstances), nodeDat)
 		if err != nil {
 			return err, true
 		}
@@ -510,6 +510,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
 				return ctl.bind(ctx, serverDat, requestingPod, launcherPod, true, int16(isc.Spec.ModelServerConfig.Port))
 			} else {
 				// Slower path: create new instance in launcher with capacity
+				logger.V(5).Info("Creating new vLLM instance", "iscHash", iscHash)
 				result, err := lClient.CreateNamedInstance(ctx, iscHash, *cfg)
 				if err != nil {
 					return fmt.Errorf("create vLLM instance: %w", err), true
@@ -564,6 +565,7 @@ func (ctl *controller) selectBestLauncherPod(
 	ctx context.Context,
 	launcherPodAnys []interface{},
 	iscHash string,
+	maxOthers int,
 	nodeDat *nodeData,
 ) (*corev1.Pod, bool, bool, error) {
 	logger := klog.FromContext(ctx)
@@ -614,8 +616,7 @@ func (ctl *controller) selectBestLauncherPod(
 		}
 
 		// Check if this launcher has capacity for a new instance
-		// A launcher has capacity if it has zero instances (can host at least one)
-		if insts.TotalInstances == 0 && candidateWithCapacity == nil {
+		if insts.TotalInstances <= maxOthers && candidateWithCapacity == nil {
 			// Priority 2: Has capacity for new instance
 			logger.V(5).Info("Found launcher with capacity for new instance",
 				"name", launcherPod.Name,
diff --git a/test/e2e/mkobjs.sh b/test/e2e/mkobjs.sh
@@ -8,15 +8,60 @@ if out=$(kubectl apply -f - 2>&1 <<EOF
 apiVersion: fma.llm-d.ai/v1alpha1
 kind: InferenceServerConfig
 metadata:
-  name: inference-server-config-$inst
+  name: inference-server-config-smol-$inst
+  labels:
+    instance: "$inst"
 spec:
   modelServerConfig:
     port: 8005
+    options: "--model HuggingFaceTB/SmolLM2-360M-Instruct"
+    env_vars:
+      VLLM_SERVER_DEV_MODE: "1"
+      VLLM_USE_V1: "1"
+      VLLM_LOGGING_LEVEL: "DEBUG"
+      VLLM_CPU_KVCACHE_SPACE: "1" # GiB
+    labels:
+      component: inference
+    annotations:
+      description: "Example InferenceServerConfig"
+  launcherConfigName: launcher-config-$inst
+---
+apiVersion: fma.llm-d.ai/v1alpha1
+kind: InferenceServerConfig
+metadata:
+  name: inference-server-config-qwen-$inst
+  labels:
+    instance: "$inst"
+spec:
+  modelServerConfig:
+    port: 8006
+    options: "--model Qwen/Qwen2.5-0.5B-Instruct"
+    env_vars:
+      VLLM_SERVER_DEV_MODE: "1"
+      VLLM_USE_V1: "1"
+      VLLM_LOGGING_LEVEL: "DEBUG"
+      VLLM_CPU_KVCACHE_SPACE: "1" # GiB
+    labels:
+      component: inference
+    annotations:
+      description: "Example InferenceServerConfig"
+  launcherConfigName: launcher-config-$inst
+---
+apiVersion: fma.llm-d.ai/v1alpha1
+kind: InferenceServerConfig
+metadata:
+  name: inference-server-config-tinyllama-$inst
+  labels:
+    instance: "$inst"
+spec:
+  modelServerConfig:
+    port: 8007
     options: "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0"
     env_vars:
       VLLM_SERVER_DEV_MODE: "1"
       VLLM_USE_V1: "1"
       VLLM_LOGGING_LEVEL: "DEBUG"
+      VLLM_CPU_KVCACHE_SPACE: "1" # GiB
     labels:
       component: inference
     annotations:
@@ -27,8 +72,10 @@ apiVersion: fma.llm-d.ai/v1alpha1
 kind: LauncherConfig
 metadata:
   name: launcher-config-$inst
+  labels:
+    instance: "$inst"
 spec:
-  maxSleepingInstances: 3
+  maxSleepingInstances: 1
   podTemplate:
     spec:
       containers:
@@ -63,7 +110,7 @@ spec:
         instance: "$inst"
       annotations:
         dual-pods.llm-d.ai/admin-port: "8081"
-        dual-pods.llm-d.ai/inference-server-config: "inference-server-config-$inst"
+        dual-pods.llm-d.ai/inference-server-config: "inference-server-config-smol-$inst"
     spec:
       containers:
         - name: inference-server
@@ -105,9 +152,12 @@ spec:
 EOF
         )
 then
-    echo inference-server-config-$inst
+    # output to be parsed by caller, e.g. the e2e test script
+    echo inference-server-config-smol-$inst
     echo launcher-config-$inst
     echo my-request-$inst
+    echo inference-server-config-qwen-$inst
+    echo inference-server-config-tinyllama-$inst
 else
     echo Failed to create objects >&2
     echo "$out" >&2
diff --git a/test/e2e/run-launcher-based.sh b/test/e2e/run-launcher-based.sh
@@ -165,6 +165,7 @@ objs=$(test/e2e/mkobjs.sh)
 isc=$(echo $objs | awk '{print $1}')
 lc=$(echo $objs | awk '{print $2}')
 rslb=$(echo $objs | awk '{print $3}')
+isc2=$(echo $objs | awk '{print $4}')
 instlb=${rslb#my-request-}
 
 # Expect requester pod to be created
@@ -240,10 +241,103 @@ kubectl wait --for condition=Ready pod/$launcherlb --timeout=5s
 
 cheer Successful instance wake-up fast path
 
+: Multiple Instances Share One Launcher
+
+# Scale requester to 0 again so that the launcher gets unbound
+kubectl scale rs $rslb --replicas=0
+# "|| true" must be grouped with grep -c (which exits 1 on a zero count); ungrouped, "||" splits the whole list and the zero-pod case can never pass
+expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | { grep -c '^pod/' || true; } | grep -w 0"
+! kubectl get pod $reqlb2
+
+# Launcher should remain
+kubectl get pod $launcherlb
+
+# Verify launcher is unbound
+expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'
+
+# Patch ReplicaSet to use isc2 instead of isc ("~1" is the JSON Pointer escape for "/" in the annotation key)
+kubectl patch rs $rslb --type=json -p='[{"op": "replace", "path": "/spec/template/metadata/annotations/dual-pods.llm-d.ai~1inference-server-config", "value": "'$isc2'"}]'
+
+sleep 5
+
+# Scale back up (should reuse same launcher and create 2nd instance)
+kubectl scale rs $rslb --replicas=1
+
+expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | grep -c '^pod/' | grep -w 1"
+
+reqlb3=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
+
+# Should still be using the same launcher pod
+launcherlb3=$(kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | sed s%pod/%%)
+[ "$launcherlb3" == "$launcherlb" ]
+
+# Verify new requester is bound to same launcher
+expect '[ "$(kubectl get pod $reqlb3 -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'
+
+# Verify launcher is bound to new requester
+expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb3" ]'
+
+# Verify the new requester is using isc2
+expect '[ "$(kubectl get pod $reqlb3 -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/inference-server-config})" == "'$isc2'" ]'
+
+# Wait for requester to be ready (launcher should already be ready)
+date
+kubectl wait --for condition=Ready pod/$reqlb3 --timeout=30s
+kubectl wait --for condition=Ready pod/$launcherlb --timeout=5s
+
+cheer Successful multiple instances sharing one launcher
+
+: Switch Instances In One Launcher
+
+# Scale requester to 0 again so that the launcher gets unbound
+kubectl scale rs $rslb --replicas=0
+# "|| true" must be grouped with grep -c (which exits 1 on a zero count); ungrouped, "||" splits the whole list and the zero-pod case can never pass
+expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | { grep -c '^pod/' || true; } | grep -w 0"
+! kubectl get pod $reqlb3
+
+# Launcher should remain
+kubectl get pod $launcherlb
+
+# Verify launcher is unbound
+expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'
+
+# Patch ReplicaSet back to use original isc ("~1" is the JSON Pointer escape for "/" in the annotation key)
+kubectl patch rs $rslb --type=json -p='[{"op": "replace", "path": "/spec/template/metadata/annotations/dual-pods.llm-d.ai~1inference-server-config", "value": "'$isc'"}]'
+
+sleep 5
+
+# Scale back up (should reuse same launcher and wake first instance)
+kubectl scale rs $rslb --replicas=1
+
+expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | grep -c '^pod/' | grep -w 1"
+
+reqlb4=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
+
+# Should still be using the same launcher pod
+launcherlb4=$(kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | sed s%pod/%%)
+[ "$launcherlb4" == "$launcherlb" ]
+
+# Verify new requester is bound to same launcher
+expect '[ "$(kubectl get pod $reqlb4 -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'
+
+# Verify launcher is bound to new requester
+expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb4" ]'
+
+# Verify the new requester is using original isc
+expect '[ "$(kubectl get pod $reqlb4 -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/inference-server-config})" == "'$isc'" ]'
+
+# Wait for requester to be ready (launcher should already be ready)
+date
+kubectl wait --for condition=Ready pod/$reqlb4 --timeout=30s
+kubectl wait --for condition=Ready pod/$launcherlb --timeout=5s
+
+cheer Successful switching instances in one launcher
+
 : Clean up launcher-based workloads
 
 kubectl delete rs $rslb --ignore-not-found=true
 kubectl delete inferenceserverconfig $isc --ignore-not-found=true
+kubectl delete inferenceserverconfig $isc2 --ignore-not-found=true
 kubectl delete launcherconfig $lc --ignore-not-found=true
 expect '[ $(kubectl get pods -o name | grep -c "^pod/my-request-") == "0" ]'