Skip to content

Commit 10a4ce1

Browse files
authored
Fix: Multi-VA E2E test failure due to missing accelerator resolution for new VAs (llm-d#922)
* fix: resolve accelerator bootstrapping deadlock for new VAs in multi-VA E2E test New VAs without prior metrics get stuck in applySaturationDecisions() because acceleratorName cannot be resolved from empty VA status or currentAllocations. This prevents HPA metric emission, creating a 3+ minute blind period where the HPA cannot scale the deployment. Add fallback accelerator resolution from deployment nodeSelector/nodeAffinity and VA label. Pre-load guidellm image into Kind cluster to eliminate runtime pull delays. Increase load job timeout from 5 to 8 minutes to account for tokenizer download on first run. * fix: use burst load for multi-VA test to trigger simulator KV cache tracking The simulator only tracks KV cache for /v1/completions requests. guidellm defaults to /v1/chat/completions, which bypasses KV cache tracking entirely. This causes avgSpareKv to remain high (0.8) despite active load, preventing the saturation engine from triggering scale-up. Switch to burst load (curl) targeting /v1/completions directly, matching the pattern used by the working smoke scale-up test. Use 2400 prompts with 400 output tokens to sustain load across multiple engine cycles. * fix: wait for scale-up instead of job completion in multi-VA E2E test The burst load jobs send 2400 requests at ~42s each, which takes ~84 minutes to complete — far exceeding the 10-minute test timeout. On Kind the lower network latency masks this, but on OpenShift the jobs always time out. Match the proven smoke test pattern: verify load jobs are running, wait for the saturation engine to detect load and scale up VA-A, then check the cost preference assertion.
1 parent c967af2 commit 10a4ce1

3 files changed

Lines changed: 100 additions & 13 deletions

File tree

deploy/kind-emulator/install.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}
4242
# llm-d Configuration
4343
LLM_D_INFERENCE_SIM_IMG_REPO=${LLM_D_INFERENCE_SIM_IMG_REPO:-"ghcr.io/llm-d/llm-d-inference-sim"}
4444
LLM_D_INFERENCE_SIM_IMG_TAG=${LLM_D_INFERENCE_SIM_IMG_TAG:-"latest"}
45+
46+
# Load generator image (guidellm) - pre-loaded into Kind for faster e2e test startup
47+
GUIDELLM_IMG=${GUIDELLM_IMG:-"ghcr.io/vllm-project/guidellm:latest"}
4548
LLM_D_MODELSERVICE_NAME="ms-$NAMESPACE_SUFFIX-llm-d-modelservice"
4649
LLM_D_MODELSERVICE_VALUES="ms-$NAMESPACE_SUFFIX/values.yaml"
4750
LLM_D_EPP_NAME="gaie-$NAMESPACE_SUFFIX-epp"
@@ -129,6 +132,9 @@ check_specific_prerequisites() {
129132
# Load WVA image into KIND cluster
130133
load_image
131134

135+
# Pre-load guidellm image so e2e load jobs don't need to pull at runtime
136+
preload_e2e_images
137+
132138
log_success "All Kind emulated deployment prerequisites met"
133139
}
134140

@@ -206,6 +212,42 @@ load_image() {
206212
log_success "Image '$WVA_IMAGE_REPO:$WVA_IMAGE_TAG' loaded into KIND cluster '$CLUSTER_NAME'"
207213
}
208214

215+
# Pre-loads e2e test images (currently the guidellm load generator) into the
# Kind cluster so load-generation jobs can start without pulling at runtime.
# Set PRELOAD_E2E_IMAGES=false to opt out. All failures are non-fatal: the
# e2e jobs simply fall back to pulling the image themselves (slower startup).
preload_e2e_images() {
    # Honor the opt-out knob before doing any work.
    if [ "${PRELOAD_E2E_IMAGES:-true}" = "false" ]; then
        log_info "Skipping e2e image pre-loading (PRELOAD_E2E_IMAGES=false)"
        return
    fi

    log_info "Pre-loading e2e test images into Kind cluster..."

    # Resolve the image platform: honor KIND_IMAGE_PLATFORM when set,
    # otherwise derive it from the host architecture.
    local platform
    platform="${KIND_IMAGE_PLATFORM:-}"
    if [ -z "$platform" ]; then
        arch="$(uname -m)"
        if [ "$arch" = "aarch64" ] || [ "$arch" = "arm64" ]; then
            platform="linux/arm64"
        else
            platform="linux/amd64"
        fi
    fi

    # Ensure the guidellm image (used by CreateLoadJob in the e2e tests) is
    # present in the local docker daemon, pulling it if necessary.
    if ! docker image inspect "$GUIDELLM_IMG" >/dev/null 2>&1; then
        log_info "Pulling guidellm image '$GUIDELLM_IMG' (platform=$platform)..."
        if ! docker pull --platform "$platform" "$GUIDELLM_IMG"; then
            log_warning "Failed to pull guidellm image - e2e load jobs will pull at runtime (slower)"
            return
        fi
    else
        log_info "guidellm image already exists locally, loading into Kind..."
    fi

    # Side-load the image into the Kind cluster's nodes.
    if ! kind load docker-image "$GUIDELLM_IMG" --name "$CLUSTER_NAME"; then
        log_warning "Failed to load guidellm image into Kind - e2e load jobs will pull at runtime"
    else
        log_success "guidellm image loaded into Kind cluster"
    fi
}
250+
209251
#### REQUIRED FUNCTION used by deploy/install.sh ####
210252
create_namespaces() {
211253
log_info "Creating namespaces..."

internal/engines/saturation/engine.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,26 @@ func (e *Engine) applySaturationDecisions(
973973
} else if curr, ok := currentAllocations[vaName]; ok {
974974
acceleratorName = curr.Accelerator
975975
}
976+
977+
// Fallback for new VAs without prior status or collected metrics:
978+
// resolve accelerator from deployment nodeSelector/nodeAffinity or VA label,
979+
// and use current deployment replicas as target to avoid unintended scaling.
980+
if acceleratorName == "" {
981+
scaleTargetName := updateVa.GetScaleTargetName()
982+
if scaleTargetName != "" {
983+
var dep appsv1.Deployment
984+
if depErr := utils.GetDeploymentWithBackoff(ctx, e.client, scaleTargetName, va.Namespace, &dep); depErr == nil {
985+
acceleratorName = utils.GetAcceleratorNameFromDeployment(&updateVa, &dep)
986+
if targetReplicas == 0 && dep.Spec.Replicas != nil {
987+
targetReplicas = int(*dep.Spec.Replicas)
988+
}
989+
} else {
990+
// If deployment fetch fails, try VA label directly
991+
acceleratorName = utils.GetAcceleratorNameFromDeployment(&updateVa, nil)
992+
}
993+
}
994+
}
995+
976996
reason = "No scaling decision (optimization loop)"
977997
}
978998

test/e2e/saturation_test.go

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -425,22 +425,29 @@ var _ = Describe("Saturation Mode - Multiple VariantAutoscalings", Label("full")
425425

426426
It("should prefer cheaper variant (VA A) for scale-up when both variants are available", func() {
427427
By("Generating load to both services")
428+
// Use burst load (curl) instead of guidellm because the simulator only tracks
429+
// KV cache for /v1/completions requests. guidellm defaults to /v1/chat/completions,
430+
// which bypasses KV cache tracking and prevents saturation detection.
431+
scaleUpPrompts := 2400
432+
if cfg.NumPrompts > scaleUpPrompts {
433+
scaleUpPrompts = cfg.NumPrompts
434+
}
428435
loadCfg := fixtures.LoadConfig{
429436
Strategy: cfg.LoadStrategy,
430-
RequestRate: cfg.RequestRate,
431-
NumPrompts: cfg.NumPrompts,
437+
RequestRate: 0, // Not used for burst pattern
438+
NumPrompts: scaleUpPrompts, // Enough prompts to sustain load across multiple engine cycles
432439
InputTokens: cfg.InputTokens,
433-
OutputTokens: cfg.OutputTokens,
440+
OutputTokens: 400, // High output tokens to hold KV cache and create queue pressure
434441
ModelID: cfg.ModelID,
435442
}
436443

437-
// Create load jobs for both services
438-
targetA := fmt.Sprintf("http://%s-service:8000", modelServiceA)
439-
err := fixtures.CreateLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-a", targetA, loadCfg)
444+
// Create burst load jobs targeting /v1/completions endpoint directly
445+
targetA := fmt.Sprintf("http://%s-service.%s.svc.cluster.local:8000/v1/completions", modelServiceA, cfg.LLMDNamespace)
446+
err := fixtures.EnsureBurstLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-a", targetA, loadCfg)
440447
Expect(err).NotTo(HaveOccurred())
441448

442-
targetB := fmt.Sprintf("http://%s-service:8000", modelServiceB)
443-
err = fixtures.CreateLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-b", targetB, loadCfg)
449+
targetB := fmt.Sprintf("http://%s-service.%s.svc.cluster.local:8000/v1/completions", modelServiceB, cfg.LLMDNamespace)
450+
err = fixtures.EnsureBurstLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-b", targetB, loadCfg)
444451
Expect(err).NotTo(HaveOccurred())
445452

446453
jobNameA := "multi-load-a-load"
@@ -468,18 +475,36 @@ var _ = Describe("Saturation Mode - Multiple VariantAutoscalings", Label("full")
468475
})
469476
})
470477

471-
By("Waiting for both load jobs to complete")
478+
By("Waiting for load jobs to start running")
472479
Eventually(func(g Gomega) {
473480
jobA, err := k8sClient.BatchV1().Jobs(cfg.LLMDNamespace).Get(ctx, jobNameA, metav1.GetOptions{})
474481
g.Expect(err).NotTo(HaveOccurred())
475-
g.Expect(jobA.Status.Succeeded).To(BeNumerically(">", 0), "Job A should complete successfully")
476-
}, 5*time.Minute, 10*time.Second).Should(Succeed())
482+
g.Expect(jobA.Status.Active).To(BeNumerically(">", 0), "Job A should be running")
483+
}, 2*time.Minute, 5*time.Second).Should(Succeed())
477484

478485
Eventually(func(g Gomega) {
479486
jobB, err := k8sClient.BatchV1().Jobs(cfg.LLMDNamespace).Get(ctx, jobNameB, metav1.GetOptions{})
480487
g.Expect(err).NotTo(HaveOccurred())
481-
g.Expect(jobB.Status.Succeeded).To(BeNumerically(">", 0), "Job B should complete successfully")
482-
}, 5*time.Minute, 10*time.Second).Should(Succeed())
488+
g.Expect(jobB.Status.Active).To(BeNumerically(">", 0), "Job B should be running")
489+
}, 2*time.Minute, 5*time.Second).Should(Succeed())
490+
491+
By("Waiting for load to ramp up (30 seconds)")
492+
time.Sleep(30 * time.Second)
493+
494+
By("Waiting for VA A (cheaper) to scale up under load")
495+
// Don't wait for burst load jobs to complete — they send 2400 requests at ~42s each,
496+
// which takes much longer than the test timeout. Instead, wait for the saturation
497+
// engine to detect load and recommend scale-up, matching the smoke test pattern.
498+
Eventually(func(g Gomega) {
499+
vaAObj := &variantautoscalingv1alpha1.VariantAutoscaling{}
500+
err := crClient.Get(ctx, client.ObjectKey{Name: vaA, Namespace: cfg.LLMDNamespace}, vaAObj)
501+
g.Expect(err).NotTo(HaveOccurred())
502+
g.Expect(vaAObj.Status.DesiredOptimizedAlloc.NumReplicas).NotTo(BeNil(), "VA A NumReplicas should be set")
503+
replicasA := *vaAObj.Status.DesiredOptimizedAlloc.NumReplicas
504+
GinkgoWriter.Printf("VA A (cheaper, cost=30.0) desired replicas: %d\n", replicasA)
505+
g.Expect(replicasA).To(BeNumerically(">", 1),
506+
"VA A should scale up beyond initial replica count")
507+
}, 8*time.Minute, 15*time.Second).Should(Succeed())
483508

484509
By("Verifying VA A (cheaper) scaled up more than VA B")
485510
vaAObj := &variantautoscalingv1alpha1.VariantAutoscaling{}

0 commit comments

Comments (0)