Skip to content

Commit 10a4ce1

Browse files
authored
Fix: Multi-VA E2E test failure due to missing accelerator resolution for new VAs (llm-d#922)
* fix: resolve accelerator bootstrapping deadlock for new VAs in multi-VA E2E test New VAs without prior metrics get stuck in applySaturationDecisions() because acceleratorName cannot be resolved from empty VA status or currentAllocations. This prevents HPA metric emission, creating a 3+ minute blind period where the HPA cannot scale the deployment. Add fallback accelerator resolution from deployment nodeSelector/nodeAffinity and VA label. Pre-load guidellm image into Kind cluster to eliminate runtime pull delays. Increase load job timeout from 5 to 8 minutes to account for tokenizer download on first run. * fix: use burst load for multi-VA test to trigger simulator KV cache tracking The simulator only tracks KV cache for /v1/completions requests. guidellm defaults to /v1/chat/completions, which bypasses KV cache tracking entirely. This causes avgSpareKv to remain high (0.8) despite active load, preventing the saturation engine from triggering scale-up. Switch to burst load (curl) targeting /v1/completions directly, matching the pattern used by the working smoke scale-up test. Use 2400 prompts with 400 output tokens to sustain load across multiple engine cycles. * fix: wait for scale-up instead of job completion in multi-VA E2E test The burst load jobs send 2400 requests at ~42s each, which takes ~84 minutes to complete — far exceeding the 10-minute test timeout. On Kind the lower network latency masks this, but on OpenShift the jobs always time out. Match the proven smoke test pattern: verify load jobs are running, wait for the saturation engine to detect load and scale up VA-A, then check the cost preference assertion.
1 parent c967af2 commit 10a4ce1

3 files changed

Lines changed: 100 additions & 13 deletions

File tree

deploy/kind-emulator/install.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}
4242
# llm-d Configuration
4343
LLM_D_INFERENCE_SIM_IMG_REPO=${LLM_D_INFERENCE_SIM_IMG_REPO:-"ghcr.io/llm-d/llm-d-inference-sim"}
4444
LLM_D_INFERENCE_SIM_IMG_TAG=${LLM_D_INFERENCE_SIM_IMG_TAG:-"latest"}
45+
46+
# Load generator image (guidellm) - pre-loaded into Kind for faster e2e test startup
47+
GUIDELLM_IMG=${GUIDELLM_IMG:-"ghcr.io/vllm-project/guidellm:latest"}
4548
LLM_D_MODELSERVICE_NAME="ms-$NAMESPACE_SUFFIX-llm-d-modelservice"
4649
LLM_D_MODELSERVICE_VALUES="ms-$NAMESPACE_SUFFIX/values.yaml"
4750
LLM_D_EPP_NAME="gaie-$NAMESPACE_SUFFIX-epp"
@@ -129,6 +132,9 @@ check_specific_prerequisites() {
129132
# Load WVA image into KIND cluster
130133
load_image
131134

135+
# Pre-load guidellm image so e2e load jobs don't need to pull at runtime
136+
preload_e2e_images
137+
132138
log_success "All Kind emulated deployment prerequisites met"
133139
}
134140

@@ -206,6 +212,42 @@ load_image() {
206212
log_success "Image '$WVA_IMAGE_REPO:$WVA_IMAGE_TAG' loaded into KIND cluster '$CLUSTER_NAME'"
207213
}
208214

215+
# Pre-loads e2e test images (currently the guidellm load generator) into the
# Kind cluster so load-generation jobs can start without pulling at runtime.
# Set PRELOAD_E2E_IMAGES=false to opt out. All failures are non-fatal: the
# e2e jobs simply fall back to pulling the image themselves (slower startup).
preload_e2e_images() {
    # Honor the opt-out knob before doing any work.
    if [ "${PRELOAD_E2E_IMAGES:-true}" = "false" ]; then
        log_info "Skipping e2e image pre-loading (PRELOAD_E2E_IMAGES=false)"
        return
    fi

    log_info "Pre-loading e2e test images into Kind cluster..."

    # Resolve the image platform: honor KIND_IMAGE_PLATFORM when set,
    # otherwise derive it from the host architecture.
    local platform
    platform="${KIND_IMAGE_PLATFORM:-}"
    if [ -z "$platform" ]; then
        arch="$(uname -m)"
        if [ "$arch" = "aarch64" ] || [ "$arch" = "arm64" ]; then
            platform="linux/arm64"
        else
            platform="linux/amd64"
        fi
    fi

    # Ensure the guidellm image (used by CreateLoadJob in the e2e tests) is
    # present in the local docker daemon, pulling it if necessary.
    if ! docker image inspect "$GUIDELLM_IMG" >/dev/null 2>&1; then
        log_info "Pulling guidellm image '$GUIDELLM_IMG' (platform=$platform)..."
        if ! docker pull --platform "$platform" "$GUIDELLM_IMG"; then
            log_warning "Failed to pull guidellm image - e2e load jobs will pull at runtime (slower)"
            return
        fi
    else
        log_info "guidellm image already exists locally, loading into Kind..."
    fi

    # Side-load the image into the Kind cluster's nodes.
    if ! kind load docker-image "$GUIDELLM_IMG" --name "$CLUSTER_NAME"; then
        log_warning "Failed to load guidellm image into Kind - e2e load jobs will pull at runtime"
    else
        log_success "guidellm image loaded into Kind cluster"
    fi
}
250+
209251
#### REQUIRED FUNCTION used by deploy/install.sh ####
210252
create_namespaces() {
211253
log_info "Creating namespaces..."

internal/engines/saturation/engine.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,26 @@ func (e *Engine) applySaturationDecisions(
973973
} else if curr, ok := currentAllocations[vaName]; ok {
974974
acceleratorName = curr.Accelerator
975975
}
976+
977+
// Fallback for new VAs without prior status or collected metrics:
978+
// resolve accelerator from deployment nodeSelector/nodeAffinity or VA label,
979+
// and use current deployment replicas as target to avoid unintended scaling.
980+
if acceleratorName == "" {
981+
scaleTargetName := updateVa.GetScaleTargetName()
982+
if scaleTargetName != "" {
983+
var dep appsv1.Deployment
984+
if depErr := utils.GetDeploymentWithBackoff(ctx, e.client, scaleTargetName, va.Namespace, &dep); depErr == nil {
985+
acceleratorName = utils.GetAcceleratorNameFromDeployment(&updateVa, &dep)
986+
if targetReplicas == 0 && dep.Spec.Replicas != nil {
987+
targetReplicas = int(*dep.Spec.Replicas)
988+
}
989+
} else {
990+
// If deployment fetch fails, try VA label directly
991+
acceleratorName = utils.GetAcceleratorNameFromDeployment(&updateVa, nil)
992+
}
993+
}
994+
}
995+
976996
reason = "No scaling decision (optimization loop)"
977997
}
978998

test/e2e/saturation_test.go

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -425,22 +425,29 @@ var _ = Describe("Saturation Mode - Multiple VariantAutoscalings", Label("full")
425425

426426
It("should prefer cheaper variant (VA A) for scale-up when both variants are available", func() {
427427
By("Generating load to both services")
428+
// Use burst load (curl) instead of guidellm because the simulator only tracks
429+
// KV cache for /v1/completions requests. guidellm defaults to /v1/chat/completions,
430+
// which bypasses KV cache tracking and prevents saturation detection.
431+
scaleUpPrompts := 2400
432+
if cfg.NumPrompts > scaleUpPrompts {
433+
scaleUpPrompts = cfg.NumPrompts
434+
}
428435
loadCfg := fixtures.LoadConfig{
429436
Strategy: cfg.LoadStrategy,
430-
RequestRate: cfg.RequestRate,
431-
NumPrompts: cfg.NumPrompts,
437+
RequestRate: 0, // Not used for burst pattern
438+
NumPrompts: scaleUpPrompts, // Enough prompts to sustain load across multiple engine cycles
432439
InputTokens: cfg.InputTokens,
433-
OutputTokens: cfg.OutputTokens,
440+
OutputTokens: 400, // High output tokens to hold KV cache and create queue pressure
434441
ModelID: cfg.ModelID,
435442
}
436443

437-
// Create load jobs for both services
438-
targetA := fmt.Sprintf("http://%s-service:8000", modelServiceA)
439-
err := fixtures.CreateLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-a", targetA, loadCfg)
444+
// Create burst load jobs targeting /v1/completions endpoint directly
445+
targetA := fmt.Sprintf("http://%s-service.%s.svc.cluster.local:8000/v1/completions", modelServiceA, cfg.LLMDNamespace)
446+
err := fixtures.EnsureBurstLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-a", targetA, loadCfg)
440447
Expect(err).NotTo(HaveOccurred())
441448

442-
targetB := fmt.Sprintf("http://%s-service:8000", modelServiceB)
443-
err = fixtures.CreateLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-b", targetB, loadCfg)
449+
targetB := fmt.Sprintf("http://%s-service.%s.svc.cluster.local:8000/v1/completions", modelServiceB, cfg.LLMDNamespace)
450+
err = fixtures.EnsureBurstLoadJob(ctx, k8sClient, cfg.LLMDNamespace, "multi-load-b", targetB, loadCfg)
444451
Expect(err).NotTo(HaveOccurred())
445452

446453
jobNameA := "multi-load-a-load"
@@ -468,18 +475,36 @@ var _ = Describe("Saturation Mode - Multiple VariantAutoscalings", Label("full")
468475
})
469476
})
470477

471-
By("Waiting for both load jobs to complete")
478+
By("Waiting for load jobs to start running")
472479
Eventually(func(g Gomega) {
473480
jobA, err := k8sClient.BatchV1().Jobs(cfg.LLMDNamespace).Get(ctx, jobNameA, metav1.GetOptions{})
474481
g.Expect(err).NotTo(HaveOccurred())
475-
g.Expect(jobA.Status.Succeeded).To(BeNumerically(">", 0), "Job A should complete successfully")
476-
}, 5*time.Minute, 10*time.Second).Should(Succeed())
482+
g.Expect(jobA.Status.Active).To(BeNumerically(">", 0), "Job A should be running")
483+
}, 2*time.Minute, 5*time.Second).Should(Succeed())
477484

478485
Eventually(func(g Gomega) {
479486
jobB, err := k8sClient.BatchV1().Jobs(cfg.LLMDNamespace).Get(ctx, jobNameB, metav1.GetOptions{})
480487
g.Expect(err).NotTo(HaveOccurred())
481-
g.Expect(jobB.Status.Succeeded).To(BeNumerically(">", 0), "Job B should complete successfully")
482-
}, 5*time.Minute, 10*time.Second).Should(Succeed())
488+
g.Expect(jobB.Status.Active).To(BeNumerically(">", 0), "Job B should be running")
489+
}, 2*time.Minute, 5*time.Second).Should(Succeed())
490+
491+
By("Waiting for load to ramp up (30 seconds)")
492+
time.Sleep(30 * time.Second)
493+
494+
By("Waiting for VA A (cheaper) to scale up under load")
495+
// Don't wait for burst load jobs to complete — they send 2400 requests at ~42s each,
496+
// which takes much longer than the test timeout. Instead, wait for the saturation
497+
// engine to detect load and recommend scale-up, matching the smoke test pattern.
498+
Eventually(func(g Gomega) {
499+
vaAObj := &variantautoscalingv1alpha1.VariantAutoscaling{}
500+
err := crClient.Get(ctx, client.ObjectKey{Name: vaA, Namespace: cfg.LLMDNamespace}, vaAObj)
501+
g.Expect(err).NotTo(HaveOccurred())
502+
g.Expect(vaAObj.Status.DesiredOptimizedAlloc.NumReplicas).NotTo(BeNil(), "VA A NumReplicas should be set")
503+
replicasA := *vaAObj.Status.DesiredOptimizedAlloc.NumReplicas
504+
GinkgoWriter.Printf("VA A (cheaper, cost=30.0) desired replicas: %d\n", replicasA)
505+
g.Expect(replicasA).To(BeNumerically(">", 1),
506+
"VA A should scale up beyond initial replica count")
507+
}, 8*time.Minute, 15*time.Second).Should(Succeed())
483508

484509
By("Verifying VA A (cheaper) scaled up more than VA B")
485510
vaAObj := &variantautoscalingv1alpha1.VariantAutoscaling{}

0 commit comments

Comments (0)