Fix e2e tests: switch cluster GPU type to nvidia-mix, raise test timeout to 60m, and reduce scale-to-zero load

asm582 · asm582 · commit 9b94f0c62a6a · 2026-01-26T23:26:43.000-05:00
diff --git a/Makefile b/Makefile
@@ -3,7 +3,7 @@ IMAGE_TAG_BASE ?= ghcr.io/llm-d
 IMG_TAG ?= latest
 IMG ?= $(IMAGE_TAG_BASE)/workload-variant-autoscaler:$(IMG_TAG)
 KIND_ARGS ?= -t mix -n 3 -g 2   # Default: 3 nodes, 2 GPUs per node, mixed vendors
-CLUSTER_GPU_TYPE ?= mix
+CLUSTER_GPU_TYPE ?= nvidia-mix
 CLUSTER_NODES ?= 3
 CLUSTER_GPUS ?= 4
 KUBECONFIG ?= $(HOME)/.kube/config
@@ -155,7 +155,7 @@ test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated
 	}
 	$(eval FOCUS_ARGS := $(if $(FOCUS),-ginkgo.focus="$(FOCUS)",))
 	$(eval SKIP_ARGS := $(if $(SKIP),-ginkgo.skip="$(SKIP)",))
-	export COLLECTOR_V2=1 KUBECONFIG=$(KUBECONFIG) K8S_EXPECTED_VERSION=$(K8S_VERSION) && go test ./test/e2e-saturation-based/ -timeout 50m -v -ginkgo.v $(FOCUS_ARGS) $(SKIP_ARGS)
+	export COLLECTOR_V2=1 KUBECONFIG=$(KUBECONFIG) K8S_EXPECTED_VERSION=$(K8S_VERSION) && go test ./test/e2e-saturation-based/ -timeout 60m -v -ginkgo.v $(FOCUS_ARGS) $(SKIP_ARGS)
 
 # E2E tests on OpenShift cluster
 # Supports KUBECONFIG or in-cluster authentication (for self-hosted runners).
diff --git a/internal/engines/pipeline/default_limiter_test.go b/internal/engines/pipeline/default_limiter_test.go
@@ -98,7 +98,7 @@ func (m *mockTypeAllocator) Remaining() int {
 
 // mockAlgorithm implements AllocationAlgorithm for testing
 type mockAlgorithm struct {
-	name       string
+	name         string
 	allocateFunc func(ctx context.Context, decisions []*interfaces.VariantDecision, allocator ResourceAllocator) error
 }
 
diff --git a/internal/engines/pipeline/greedy_saturation_algorithm_test.go b/internal/engines/pipeline/greedy_saturation_algorithm_test.go
@@ -162,7 +162,7 @@ var _ = Describe("GreedyBySaturation", func() {
 				Expect(err).NotTo(HaveOccurred())
 
 				// Only 3 GPUs available, 2 GPUs per replica = 1 replica can be added
-				Expect(decisions[0].GPUsAllocated).To(Equal(2)) // Only full replicas count
+				Expect(decisions[0].GPUsAllocated).To(Equal(2))  // Only full replicas count
 				Expect(decisions[0].TargetReplicas).To(Equal(2)) // 1 + 1 replica
 				Expect(decisions[0].WasLimited).To(BeTrue())
 			})
diff --git a/test/e2e-saturation-based/e2e_saturation_suite_test.go b/test/e2e-saturation-based/e2e_saturation_suite_test.go
@@ -57,7 +57,7 @@ var (
 const (
 	maximumAvailableGPUs = 4
 	numNodes             = 3
-	gpuTypes             = "mix"
+	gpuTypes             = "nvidia-mix"
 
 	kindClusterName = "wva-gpu-cluster"
 )
diff --git a/test/e2e-saturation-based/e2e_scale_from_zero_test.go b/test/e2e-saturation-based/e2e_scale_from_zero_test.go
@@ -73,7 +73,7 @@ var _ = Describe("Test workload-variant-autoscaler - Scale-From-Zero Feature", O
 		appLabel = name
 		namespace = llmDNamespace
 		port = 8000
-		modelName = llamaModelId + "-sfz"
+		modelName = llamaModelId
 		gatewayService = "infra-sim-inference-gateway"
 
 		// Start with 0 replicas to test scale-from-zero
diff --git a/test/e2e-saturation-based/e2e_scale_to_zero_test.go b/test/e2e-saturation-based/e2e_scale_to_zero_test.go
@@ -283,8 +283,8 @@ retention_period: %s`, modelName, retentionPeriodShort),
 				namespace,
 				fmt.Sprintf("http://%s:%d", gatewayName, 80),
 				modelName,
-				loadRatePerSecond,
-				maxExecutionTimeSec,
+				5,  // Reduced rate (was loadRatePerSecond=8)
+				10, // Drastically reduced duration to prevent queue backlog (was 60s)
 				inputTokens,
 				outputTokens,
 				k8sClient,
diff --git a/test/utils/e2eutils.go b/test/utils/e2eutils.go
@@ -283,7 +283,7 @@ func IsCertManagerCRDsInstalled() bool {
 
 // LoadImageToKindClusterWithName loads a local docker image to the kind cluster
 func LoadImageToKindClusterWithName(name string, maxGPUs int) error {
-	cluster, err := CheckIfClusterExistsOrCreate(maxGPUs)
+	cluster, err := CheckIfClusterExistsOrCreate(maxGPUs, "mix")
 	if err != nil {
 		return err
 	}
@@ -293,7 +293,7 @@ func LoadImageToKindClusterWithName(name string, maxGPUs int) error {
 	return err
 }
 
-func CheckIfClusterExistsOrCreate(maxGPUs int) (string, error) {
+func CheckIfClusterExistsOrCreate(maxGPUs int, gpuType string) (string, error) {
 	// Check if the kind cluster exists
 	existsCmd := exec.Command("kind", "get", "clusters")
 	output, err := Run(existsCmd)
@@ -312,7 +312,7 @@ func CheckIfClusterExistsOrCreate(maxGPUs int) (string, error) {
 	// Create the kind cluster if it doesn't exist
 	expectedVersion := os.Getenv("K8S_EXPECTED_VERSION")
 	if !clusterExists {
-		scriptCmd := exec.Command("bash", "deploy/kind-emulator/setup.sh", "-g", fmt.Sprintf("%d", maxGPUs), "K8S_VERSION="+expectedVersion)
+		scriptCmd := exec.Command("bash", "deploy/kind-emulator/setup.sh", "-g", fmt.Sprintf("%d", maxGPUs), "-t", gpuType, "K8S_VERSION="+expectedVersion)
 		if _, err := Run(scriptCmd); err != nil {
 			return "", fmt.Errorf("failed to create kind cluster: %v", err)
 		}
@@ -1240,7 +1240,7 @@ func SetupTestEnvironment(image string, numNodes, gpusPerNode int, gpuTypes stri
 	gom.Expect(os.Setenv("CLUSTER_NAME", clusterName)).To(gom.Succeed())
 	setEnvIfNotSet("CLUSTER_NODES", fmt.Sprintf("%d", numNodes))
 	setEnvIfNotSet("CLUSTER_GPUS", fmt.Sprintf("%d", gpusPerNode))
-	setEnvIfNotSet("CLUSTER_GPU_TYPE", gpuTypes) // Use CLUSTER_GPU_TYPE to match Makefile
+	setEnvIfNotSet("CLUSTER_GPU_TYPE", gpuTypes)                                     // Use CLUSTER_GPU_TYPE to match Makefile
 	gom.Expect(os.Setenv("WVA_IMAGE_PULL_POLICY", "IfNotPresent")).To(gom.Succeed()) // The image is built locally by the tests
 	gom.Expect(os.Setenv("CREATE_CLUSTER", "true")).To(gom.Succeed())                // Always create a new cluster for E2E tests
 
diff --git a/test/utils/resources/llmdsim.go b/test/utils/resources/llmdsim.go
@@ -53,6 +53,7 @@ func CreateLlmdSimDeployment(namespace, deployName, modelName, appLabel, port st
 								"--enable-kvcache",
 								"--kv-cache-size=1024",
 								"--block-size=16",
+								"--tokenizers-cache-dir=/tmp",
 							},
 							Env: []corev1.EnvVar{
 								{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{
@@ -67,6 +68,12 @@ func CreateLlmdSimDeployment(namespace, deployName, modelName, appLabel, port st
 										FieldPath:  "metadata.namespace",
 									},
 								}},
+								{Name: "POD_IP", ValueFrom: &corev1.EnvVarSource{
+									FieldRef: &corev1.ObjectFieldSelector{
+										APIVersion: "v1",
+										FieldPath:  "status.podIP",
+									},
+								}},
 							},
 							Ports: []corev1.ContainerPort{
 								{ContainerPort: 8000, Name: "http", Protocol: corev1.ProtocolTCP},
@@ -105,6 +112,7 @@ func CreateLlmdSimDeploymentWithGPU(namespace, deployName, modelName, appLabel,
 			"--enable-kvcache",
 			"--kv-cache-size=1024",
 			"--block-size=16",
+			"--tokenizers-cache-dir=/tmp",
 		},
 		Env: []corev1.EnvVar{
 			{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{
@@ -119,6 +127,12 @@ func CreateLlmdSimDeploymentWithGPU(namespace, deployName, modelName, appLabel,
 					FieldPath:  "metadata.namespace",
 				},
 			}},
+			{Name: "POD_IP", ValueFrom: &corev1.EnvVarSource{
+				FieldRef: &corev1.ObjectFieldSelector{
+					APIVersion: "v1",
+					FieldPath:  "status.podIP",
+				},
+			}},
 		},
 		Ports: []corev1.ContainerPort{
 			{ContainerPort: 8000, Name: "http", Protocol: corev1.ProtocolTCP},
@@ -185,6 +199,19 @@ func CreateLlmdSimDeploymentWithGPUAndNodeSelector(
 
 	if len(nodeSelector) > 0 {
 		deployment.Spec.Template.Spec.NodeSelector = nodeSelector
+		// Add tolerations for control-plane nodes as H100s might be on control-plane in kind-emulator
+		deployment.Spec.Template.Spec.Tolerations = []corev1.Toleration{
+			{
+				Key:      "node-role.kubernetes.io/control-plane",
+				Operator: corev1.TolerationOpExists,
+				Effect:   corev1.TaintEffectNoSchedule,
+			},
+			{
+				Key:      "node-role.kubernetes.io/master",
+				Operator: corev1.TolerationOpExists,
+				Effect:   corev1.TaintEffectNoSchedule,
+			},
+		}
 	}
 
 	return deployment

Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ func (m *mockTypeAllocator) Remaining() int {`
`98`	`98`
`99`	`99`	`// mockAlgorithm implements AllocationAlgorithm for testing`
`100`	`100`	`type mockAlgorithm struct {`
`101`		`- name string`
	`101`	`+ name string`
`102`	`102`	`allocateFunc func(ctx context.Context, decisions []*interfaces.VariantDecision, allocator ResourceAllocator) error`
`103`	`103`	`}`
`104`	`104`
Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ var (`
`57`	`57`	`const (`
`58`	`58`	`maximumAvailableGPUs = 4`
`59`	`59`	`numNodes = 3`
`60`		`- gpuTypes = "mix"`
	`60`	`+ gpuTypes = "nvidia-mix"`
`61`	`61`
`62`	`62`	`kindClusterName = "wva-gpu-cluster"`
`63`	`63`	`)`