Route benchmark load through Gateway/EPP (full llm-d stack)

ev-shindin · ev-shindin · commit 6ff71a401625 · 2026-03-22T18:39:36.000+02:00
Change benchmark to send load through the Gateway service instead of
directly to the model service. Traffic now flows through the full
llm-d stack: Gateway → HTTPRoute → InferencePool → EPP → model pods.

The benchmark model service pods already have the
llm-d.ai/inferenceServing label, so the InferencePool discovers
them automatically.

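Add GatewayServiceName/GatewayServicePort config fields (env:
GATEWAY_SERVICE_NAME, GATEWAY_SERVICE_PORT; the port defaults to 80 and
the service name default depends on ENVIRONMENT). In BeforeSuite, verify
that the Gateway service exists and that at least one EPP pod is running.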
diff --git a/test/benchmark/benchmark_test.go b/test/benchmark/benchmark_test.go
@@ -234,13 +234,17 @@ var _ = Describe("Scale-Up Latency Benchmark", Label("benchmark"), Ordered, func
 		GinkgoWriter.Printf("Running spike phase for %v\n", spikeDuration)
 
 		spikeStart := time.Now()
-		targetURL := fmt.Sprintf("http://%s.%s.svc.cluster.local:8000/v1/completions", serviceName, benchCfg.LLMDNamespace)
+
+		// Route load through the Gateway → EPP → model server (full llm-d stack)
+		gwHost := fmt.Sprintf("%s.%s.svc.cluster.local", benchCfg.GatewayServiceName, benchCfg.LLMDNamespace)
+		targetURL := fmt.Sprintf("http://%s:%d/v1/completions", gwHost, benchCfg.GatewayServicePort)
+		GinkgoWriter.Printf("Load target URL (via Gateway): %s\n", targetURL)
 
 		By("Cleaning up any existing load jobs")
 		fixtures.DeleteParallelLoadJobs(ctx, k8sClient, jobBaseName, benchCfg.LLMDNamespace, benchLoadWorkers)
 		time.Sleep(2 * time.Second)
 
-		By("Waiting for service endpoints to exist")
+		By("Waiting for model service endpoints to exist")
 		Eventually(func(g Gomega) {
 			endpoints, err := k8sClient.CoreV1().Endpoints(benchCfg.LLMDNamespace).Get(ctx, serviceName, metav1.GetOptions{})
 			g.Expect(err).NotTo(HaveOccurred())
@@ -253,26 +257,27 @@ var _ = Describe("Scale-Up Latency Benchmark", Label("benchmark"), Ordered, func
 			g.Expect(readyCount).To(BeNumerically(">", 0))
 		}, 5*time.Minute, 10*time.Second).Should(Succeed())
 
-		By("Running in-cluster connectivity probe to diagnose load path")
+		By("Running in-cluster connectivity probe via Gateway")
 		probePodName := "bench-connectivity-probe"
 		probeScript := fmt.Sprintf(`#!/bin/sh
-echo "=== DNS Resolution ==="
-nslookup %s.%s.svc.cluster.local 2>&1 || echo "nslookup failed (tool may not exist)"
+echo "=== Gateway DNS Resolution ==="
+nslookup %s 2>&1 || echo "nslookup failed (tool may not exist)"
 echo ""
-echo "=== Service ClusterIP ==="
-getent hosts %s.%s.svc.cluster.local 2>&1 || echo "getent failed"
+echo "=== Gateway Service ClusterIP ==="
+getent hosts %s 2>&1 || echo "getent failed"
 echo ""
-echo "=== Curl verbose GET ==="
-curl -v --max-time 10 "%s" 2>&1
+echo "=== Direct model service check ==="
+HTTP_CODE=$(curl -s -o /dev/null -w "%%{http_code}" --max-time 10 "http://%s.%s.svc.cluster.local:8000/v1/completions" 2>/dev/null)
+echo "Direct model service HTTP status: $HTTP_CODE"
 echo ""
-echo "=== Curl verbose POST ==="
-curl -v --max-time 10 -X POST "%s" -H "Content-Type: application/json" -d '{"model":"test","prompt":"hello","max_tokens":1}' 2>&1
+echo "=== Gateway POST (full stack path) ==="
+curl -v --max-time 15 -X POST "%s" -H "Content-Type: application/json" -d '{"model":"%s","prompt":"hello","max_tokens":1}' 2>&1
 echo ""
-echo "=== HTTP status code only ==="
-HTTP_CODE=$(curl -s -o /dev/null -w "%%{http_code}" "%s" 2>/dev/null)
-echo "HTTP status code: $HTTP_CODE"
-echo "Grep test: $(echo $HTTP_CODE | grep -E '^(200|404)' && echo PASS || echo FAIL)"
-`, serviceName, benchCfg.LLMDNamespace, serviceName, benchCfg.LLMDNamespace, targetURL, targetURL, targetURL)
+echo "=== Gateway HTTP status code ==="
+HTTP_CODE=$(curl -s -o /dev/null -w "%%{http_code}" --max-time 15 -X POST "%s" -H "Content-Type: application/json" -d '{"model":"%s","prompt":"test","max_tokens":1}' 2>/dev/null)
+echo "Gateway HTTP status code: $HTTP_CODE"
+echo "Grep test: $(echo $HTTP_CODE | grep -E '^(200|404|405)' && echo PASS || echo FAIL)"
+`, gwHost, gwHost, serviceName, benchCfg.LLMDNamespace, targetURL, benchCfg.ModelID, targetURL, benchCfg.ModelID)
 
 		probePod := &corev1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
diff --git a/test/benchmark/config.go b/test/benchmark/config.go
@@ -26,11 +26,13 @@ type BenchmarkConfig struct {
 	ScalerBackend string
 	KEDANamespace string
 
-	// EPP configuration
-	EPPMode          string
-	PoolName         string
-	EndpointSelector map[string]string
-	EPPServiceName   string
+	// EPP / Gateway configuration
+	EPPMode            string
+	PoolName           string
+	EndpointSelector   map[string]string
+	EPPServiceName     string
+	GatewayServiceName string
+	GatewayServicePort int
 
 	// Model configuration
 	ModelID         string
@@ -67,8 +69,10 @@ type BenchmarkConfig struct {
 func LoadConfigFromEnv() BenchmarkConfig {
 	env := getEnv("ENVIRONMENT", "kind-emulator")
 	eppServiceDefault := "gaie-inference-scheduling-epp"
+	gatewayServiceDefault := "infra-inference-scheduling-inference-gateway-istio"
 	if env == "kind-emulator" {
 		eppServiceDefault = "gaie-sim-epp"
+		gatewayServiceDefault = "infra-sim-inference-gateway-istio"
 	}
 
 	return BenchmarkConfig{
@@ -85,10 +89,12 @@ func LoadConfigFromEnv() BenchmarkConfig {
 		ScalerBackend: getEnv("SCALER_BACKEND", "prometheus-adapter"),
 		KEDANamespace: getEnv("KEDA_NAMESPACE", "keda-system"),
 
-		EPPMode:          getEnv("EPP_MODE", "poolName"),
-		PoolName:         getEnv("POOL_NAME", ""),
-		EndpointSelector: parseEndpointSelector(getEnv("ENDPOINT_SELECTOR", "")),
-		EPPServiceName:   getEnv("EPP_SERVICE_NAME", eppServiceDefault),
+		EPPMode:            getEnv("EPP_MODE", "poolName"),
+		PoolName:           getEnv("POOL_NAME", ""),
+		EndpointSelector:   parseEndpointSelector(getEnv("ENDPOINT_SELECTOR", "")),
+		EPPServiceName:     getEnv("EPP_SERVICE_NAME", eppServiceDefault),
+		GatewayServiceName: getEnv("GATEWAY_SERVICE_NAME", gatewayServiceDefault),
+		GatewayServicePort: getEnvInt("GATEWAY_SERVICE_PORT", 80),
 
 		ModelID:         getEnv("MODEL_ID", "unsloth/Meta-Llama-3.1-8B"),
 		AcceleratorType: getEnv("ACCELERATOR_TYPE", "H100"),
diff --git a/test/benchmark/suite_test.go b/test/benchmark/suite_test.go
@@ -52,6 +52,8 @@ var _ = BeforeSuite(func() {
 	GinkgoWriter.Printf("Environment: %s\n", benchCfg.Environment)
 	GinkgoWriter.Printf("WVA Namespace: %s\n", benchCfg.WVANamespace)
 	GinkgoWriter.Printf("LLMD Namespace: %s\n", benchCfg.LLMDNamespace)
+	GinkgoWriter.Printf("Gateway Service: %s:%d\n", benchCfg.GatewayServiceName, benchCfg.GatewayServicePort)
+	GinkgoWriter.Printf("EPP Service: %s\n", benchCfg.EPPServiceName)
 	GinkgoWriter.Printf("Results File: %s\n", benchCfg.BenchmarkResultsFile)
 	GinkgoWriter.Printf("===============================\n\n")
 
@@ -98,6 +100,32 @@ var _ = BeforeSuite(func() {
 		g.Expect(runningPods).To(BeNumerically(">", 0), "No running WVA controller pods")
 	}, 2*time.Minute, 5*time.Second).Should(Succeed(), "WVA controller should be running")
 
+	By("Verifying Gateway service exists")
+	Eventually(func(g Gomega) {
+		svc, err := k8sClient.CoreV1().Services(benchCfg.LLMDNamespace).Get(ctx, benchCfg.GatewayServiceName, metav1.GetOptions{})
+		g.Expect(err).NotTo(HaveOccurred(), "Gateway service %s not found in namespace %s", benchCfg.GatewayServiceName, benchCfg.LLMDNamespace)
+		g.Expect(svc.Spec.Ports).NotTo(BeEmpty(), "Gateway service has no ports")
+		GinkgoWriter.Printf("Gateway service %s found (type=%s)\n", svc.Name, svc.Spec.Type)
+	}, 2*time.Minute, 5*time.Second).Should(Succeed(), "Gateway service should exist")
+
+	By("Verifying EPP pods are running")
+	Eventually(func(g Gomega) {
+		pods, err := k8sClient.CoreV1().Pods(benchCfg.LLMDNamespace).List(ctx, metav1.ListOptions{
+			LabelSelector: "app.kubernetes.io/name=inferencepool",
+		})
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(pods.Items).NotTo(BeEmpty(), "No EPP pods found")
+
+		runningPods := 0
+		for _, pod := range pods.Items {
+			if pod.Status.Phase == corev1.PodRunning {
+				runningPods++
+			}
+		}
+		g.Expect(runningPods).To(BeNumerically(">", 0), "No running EPP pods")
+		GinkgoWriter.Printf("EPP: %d running pods\n", runningPods)
+	}, 2*time.Minute, 5*time.Second).Should(Succeed(), "EPP pods should be running")
+
 	By("Verifying Prometheus is available")
 	Eventually(func(g Gomega) {
 		pods, err := k8sClient.CoreV1().Pods(benchCfg.MonitoringNS).List(ctx, metav1.ListOptions{