Skip to content

Commit 6ff71a4

Browse files
committed
Route benchmark load through Gateway/EPP (full llm-d stack)
Change the benchmark to send load through the Gateway service instead of directly to the model service. Traffic now flows through the full llm-d stack: Gateway → HTTPRoute → InferencePool → EPP → model pods. The benchmark model service pods already have the llm-d.ai/inferenceServing label, so the InferencePool discovers them automatically. Add GatewayServiceName/GatewayServicePort config fields (env: GATEWAY_SERVICE_NAME, GATEWAY_SERVICE_PORT; the port defaults to 80) and add BeforeSuite checks that the Gateway service exists with at least one port and that at least one EPP pod is in the Running phase.
1 parent e541a2d commit 6ff71a4

3 files changed

Lines changed: 64 additions & 25 deletions

File tree

test/benchmark/benchmark_test.go

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -234,13 +234,17 @@ var _ = Describe("Scale-Up Latency Benchmark", Label("benchmark"), Ordered, func
234234
GinkgoWriter.Printf("Running spike phase for %v\n", spikeDuration)
235235

236236
spikeStart := time.Now()
237-
targetURL := fmt.Sprintf("http://%s.%s.svc.cluster.local:8000/v1/completions", serviceName, benchCfg.LLMDNamespace)
237+
238+
// Route load through the Gateway → EPP → model server (full llm-d stack)
239+
gwHost := fmt.Sprintf("%s.%s.svc.cluster.local", benchCfg.GatewayServiceName, benchCfg.LLMDNamespace)
240+
targetURL := fmt.Sprintf("http://%s:%d/v1/completions", gwHost, benchCfg.GatewayServicePort)
241+
GinkgoWriter.Printf("Load target URL (via Gateway): %s\n", targetURL)
238242

239243
By("Cleaning up any existing load jobs")
240244
fixtures.DeleteParallelLoadJobs(ctx, k8sClient, jobBaseName, benchCfg.LLMDNamespace, benchLoadWorkers)
241245
time.Sleep(2 * time.Second)
242246

243-
By("Waiting for service endpoints to exist")
247+
By("Waiting for model service endpoints to exist")
244248
Eventually(func(g Gomega) {
245249
endpoints, err := k8sClient.CoreV1().Endpoints(benchCfg.LLMDNamespace).Get(ctx, serviceName, metav1.GetOptions{})
246250
g.Expect(err).NotTo(HaveOccurred())
@@ -253,26 +257,27 @@ var _ = Describe("Scale-Up Latency Benchmark", Label("benchmark"), Ordered, func
253257
g.Expect(readyCount).To(BeNumerically(">", 0))
254258
}, 5*time.Minute, 10*time.Second).Should(Succeed())
255259

256-
By("Running in-cluster connectivity probe to diagnose load path")
260+
By("Running in-cluster connectivity probe via Gateway")
257261
probePodName := "bench-connectivity-probe"
258262
probeScript := fmt.Sprintf(`#!/bin/sh
259-
echo "=== DNS Resolution ==="
260-
nslookup %s.%s.svc.cluster.local 2>&1 || echo "nslookup failed (tool may not exist)"
263+
echo "=== Gateway DNS Resolution ==="
264+
nslookup %s 2>&1 || echo "nslookup failed (tool may not exist)"
261265
echo ""
262-
echo "=== Service ClusterIP ==="
263-
getent hosts %s.%s.svc.cluster.local 2>&1 || echo "getent failed"
266+
echo "=== Gateway Service ClusterIP ==="
267+
getent hosts %s 2>&1 || echo "getent failed"
264268
echo ""
265-
echo "=== Curl verbose GET ==="
266-
curl -v --max-time 10 "%s" 2>&1
269+
echo "=== Direct model service check ==="
270+
HTTP_CODE=$(curl -s -o /dev/null -w "%%{http_code}" --max-time 10 "http://%s.%s.svc.cluster.local:8000/v1/completions" 2>/dev/null)
271+
echo "Direct model service HTTP status: $HTTP_CODE"
267272
echo ""
268-
echo "=== Curl verbose POST ==="
269-
curl -v --max-time 10 -X POST "%s" -H "Content-Type: application/json" -d '{"model":"test","prompt":"hello","max_tokens":1}' 2>&1
273+
echo "=== Gateway POST (full stack path) ==="
274+
curl -v --max-time 15 -X POST "%s" -H "Content-Type: application/json" -d '{"model":"%s","prompt":"hello","max_tokens":1}' 2>&1
270275
echo ""
271-
echo "=== HTTP status code only ==="
272-
HTTP_CODE=$(curl -s -o /dev/null -w "%%{http_code}" "%s" 2>/dev/null)
273-
echo "HTTP status code: $HTTP_CODE"
274-
echo "Grep test: $(echo $HTTP_CODE | grep -E '^(200|404)' && echo PASS || echo FAIL)"
275-
`, serviceName, benchCfg.LLMDNamespace, serviceName, benchCfg.LLMDNamespace, targetURL, targetURL, targetURL)
276+
echo "=== Gateway HTTP status code ==="
277+
HTTP_CODE=$(curl -s -o /dev/null -w "%%{http_code}" --max-time 15 -X POST "%s" -H "Content-Type: application/json" -d '{"model":"%s","prompt":"test","max_tokens":1}' 2>/dev/null)
278+
echo "Gateway HTTP status code: $HTTP_CODE"
279+
echo "Grep test: $(echo $HTTP_CODE | grep -E '^(200|404|405)' && echo PASS || echo FAIL)"
280+
`, gwHost, gwHost, serviceName, benchCfg.LLMDNamespace, targetURL, benchCfg.ModelID, targetURL, benchCfg.ModelID)
276281

277282
probePod := &corev1.Pod{
278283
ObjectMeta: metav1.ObjectMeta{

test/benchmark/config.go

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@ type BenchmarkConfig struct {
2626
ScalerBackend string
2727
KEDANamespace string
2828

29-
// EPP configuration
30-
EPPMode string
31-
PoolName string
32-
EndpointSelector map[string]string
33-
EPPServiceName string
29+
// EPP / Gateway configuration
30+
EPPMode string
31+
PoolName string
32+
EndpointSelector map[string]string
33+
EPPServiceName string
34+
GatewayServiceName string
35+
GatewayServicePort int
3436

3537
// Model configuration
3638
ModelID string
@@ -67,8 +69,10 @@ type BenchmarkConfig struct {
6769
func LoadConfigFromEnv() BenchmarkConfig {
6870
env := getEnv("ENVIRONMENT", "kind-emulator")
6971
eppServiceDefault := "gaie-inference-scheduling-epp"
72+
gatewayServiceDefault := "infra-inference-scheduling-inference-gateway-istio"
7073
if env == "kind-emulator" {
7174
eppServiceDefault = "gaie-sim-epp"
75+
gatewayServiceDefault = "infra-sim-inference-gateway-istio"
7276
}
7377

7478
return BenchmarkConfig{
@@ -85,10 +89,12 @@ func LoadConfigFromEnv() BenchmarkConfig {
8589
ScalerBackend: getEnv("SCALER_BACKEND", "prometheus-adapter"),
8690
KEDANamespace: getEnv("KEDA_NAMESPACE", "keda-system"),
8791

88-
EPPMode: getEnv("EPP_MODE", "poolName"),
89-
PoolName: getEnv("POOL_NAME", ""),
90-
EndpointSelector: parseEndpointSelector(getEnv("ENDPOINT_SELECTOR", "")),
91-
EPPServiceName: getEnv("EPP_SERVICE_NAME", eppServiceDefault),
92+
EPPMode: getEnv("EPP_MODE", "poolName"),
93+
PoolName: getEnv("POOL_NAME", ""),
94+
EndpointSelector: parseEndpointSelector(getEnv("ENDPOINT_SELECTOR", "")),
95+
EPPServiceName: getEnv("EPP_SERVICE_NAME", eppServiceDefault),
96+
GatewayServiceName: getEnv("GATEWAY_SERVICE_NAME", gatewayServiceDefault),
97+
GatewayServicePort: getEnvInt("GATEWAY_SERVICE_PORT", 80),
9298

9399
ModelID: getEnv("MODEL_ID", "unsloth/Meta-Llama-3.1-8B"),
94100
AcceleratorType: getEnv("ACCELERATOR_TYPE", "H100"),

test/benchmark/suite_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ var _ = BeforeSuite(func() {
5252
GinkgoWriter.Printf("Environment: %s\n", benchCfg.Environment)
5353
GinkgoWriter.Printf("WVA Namespace: %s\n", benchCfg.WVANamespace)
5454
GinkgoWriter.Printf("LLMD Namespace: %s\n", benchCfg.LLMDNamespace)
55+
GinkgoWriter.Printf("Gateway Service: %s:%d\n", benchCfg.GatewayServiceName, benchCfg.GatewayServicePort)
56+
GinkgoWriter.Printf("EPP Service: %s\n", benchCfg.EPPServiceName)
5557
GinkgoWriter.Printf("Results File: %s\n", benchCfg.BenchmarkResultsFile)
5658
GinkgoWriter.Printf("===============================\n\n")
5759

@@ -98,6 +100,32 @@ var _ = BeforeSuite(func() {
98100
g.Expect(runningPods).To(BeNumerically(">", 0), "No running WVA controller pods")
99101
}, 2*time.Minute, 5*time.Second).Should(Succeed(), "WVA controller should be running")
100102

103+
By("Verifying Gateway service exists")
104+
Eventually(func(g Gomega) {
105+
svc, err := k8sClient.CoreV1().Services(benchCfg.LLMDNamespace).Get(ctx, benchCfg.GatewayServiceName, metav1.GetOptions{})
106+
g.Expect(err).NotTo(HaveOccurred(), "Gateway service %s not found in namespace %s", benchCfg.GatewayServiceName, benchCfg.LLMDNamespace)
107+
g.Expect(svc.Spec.Ports).NotTo(BeEmpty(), "Gateway service has no ports")
108+
GinkgoWriter.Printf("Gateway service %s found (type=%s)\n", svc.Name, svc.Spec.Type)
109+
}, 2*time.Minute, 5*time.Second).Should(Succeed(), "Gateway service should exist")
110+
111+
By("Verifying EPP pods are running")
112+
Eventually(func(g Gomega) {
113+
pods, err := k8sClient.CoreV1().Pods(benchCfg.LLMDNamespace).List(ctx, metav1.ListOptions{
114+
LabelSelector: "app.kubernetes.io/name=inferencepool",
115+
})
116+
g.Expect(err).NotTo(HaveOccurred())
117+
g.Expect(pods.Items).NotTo(BeEmpty(), "No EPP pods found")
118+
119+
runningPods := 0
120+
for _, pod := range pods.Items {
121+
if pod.Status.Phase == corev1.PodRunning {
122+
runningPods++
123+
}
124+
}
125+
g.Expect(runningPods).To(BeNumerically(">", 0), "No running EPP pods")
126+
GinkgoWriter.Printf("EPP: %d running pods\n", runningPods)
127+
}, 2*time.Minute, 5*time.Second).Should(Succeed(), "EPP pods should be running")
128+
101129
By("Verifying Prometheus is available")
102130
Eventually(func(g Gomega) {
103131
pods, err := k8sClient.CoreV1().Pods(benchCfg.MonitoringNS).List(ctx, metav1.ListOptions{

0 commit comments

Comments
 (0)