diff --git a/.github/workflows/ci-pr-checks.yaml b/.github/workflows/ci-pr-checks.yaml index 2afb92d737..767b4d12dc 100644 --- a/.github/workflows/ci-pr-checks.yaml +++ b/.github/workflows/ci-pr-checks.yaml @@ -450,4 +450,4 @@ jobs: E2E_LABEL_FILTER: ${{ matrix.suite.label-filter }} LOAD_VLLM_RENDER_IMAGE: ${{ matrix.suite.needs-renderer }} PULL_VLLM_RENDER_IMAGE: "false" - run: make test-e2e-scheduler-run + run: make test-e2e-router-run diff --git a/Makefile b/Makefile index 55d1359fc5..1424e39046 100644 --- a/Makefile +++ b/Makefile @@ -304,24 +304,24 @@ test-e2e-gaie-run: image-pull ## Ensure images are present, then run GAIE e2e te $(CONTAINER_RUNTIME) run $(BUILDER_RUN_FLAGS) $(BUILDER_E2E_FLAGS) \ -e EPP_IMAGE=$(GAIE_E2E_IMAGE) \ -e USE_KIND=true \ - $(BUILDER_IMAGE) ./hack/test-e2e.sh + $(BUILDER_IMAGE) ./test/scripts/test-e2e-gaie.sh .PHONY: test-e2e-gaie test-e2e-gaie: image-build-builder image-build ## Build images and run GAIE e2e tests $(MAKE) test-e2e-gaie-run -.PHONY: test-e2e-scheduler-run -test-e2e-scheduler-run: image-pull ## Ensure images are present, then run scheduler e2e tests +.PHONY: test-e2e-router-run +test-e2e-router-run: image-pull ## Ensure images are present, then run router e2e tests @printf "\033[33;1m==== Running End to End Tests ====\033[0m\n" $(CONTAINER_RUNTIME) run $(BUILDER_RUN_FLAGS) $(BUILDER_E2E_FLAGS) \ - $(BUILDER_IMAGE) ./test/scripts/run_e2e.sh + $(BUILDER_IMAGE) ./test/scripts/test-e2e-router.sh -.PHONY: test-e2e-scheduler -test-e2e-scheduler: image-build-builder image-build ## Build images and run scheduler e2e tests - $(MAKE) test-e2e-scheduler-run +.PHONY: test-e2e-router +test-e2e-router: image-build-builder image-build ## Build images and run router e2e tests + $(MAKE) test-e2e-router-run .PHONY: test-e2e -test-e2e: test-e2e-gaie test-e2e-scheduler ## Run all end-to-end tests sequentially +test-e2e: test-e2e-gaie test-e2e-router ## Run all end-to-end tests sequentially .PHONY: bench-tokenizer diff --git a/README.md 
b/README.md index 3ba9804703..4d8fa1dc80 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ A lightweight deployment where a self-managed Envoy proxy runs alongside the EPP ### 2. Gateway Mode (Inference Gateway) The recommended mode for production environments, leveraging the official [Gateway API]. In this mode, the EPP acts as a backend for an `InferencePool`, which is referenced by an `HTTPRoute` on a shared `Gateway`. This enables advanced traffic management, multi-cluster load balancing, and shared infrastructure for both inference and traditional workloads. -For more details on the router architecture, routing logic, and different plugins (filters and scorers), see the [Architecture Documentation]. +For more details on the router architecture, routing logic, and different plugins (filters and scorers), see the [Architecture Documentation]. For resource provisioning and container sizing recommendations under heavy or long-context workloads, see the [EPP Container Sizing Guide]. 
--- @@ -61,6 +61,7 @@ To ensure clarity across the project, we use the following standard terminology: [Kubernetes Gateway API]:https://gateway-api.sigs.k8s.io/ [Architecture Documentation]:docs/architecture.md [Disaggregation Documentation]:docs/disaggregation.md +[EPP Container Sizing Guide]:docs/operations.md [InferencePool]:https://github.com/kubernetes-sigs/gateway-api-inference-extension [Gateway API Inference Extension (GIE)]:https://github.com/kubernetes-sigs/gateway-api-inference-extension [Kubernetes Gateway API Inference Extensions]:https://github.com/kubernetes-sigs/gateway-api-inference-extension diff --git a/config/charts/README.md b/config/charts/README.md index fe7d176b5b..c30ef6441a 100644 --- a/config/charts/README.md +++ b/config/charts/README.md @@ -32,15 +32,14 @@ helm install my-standalone-router ./config/charts/llm-d-router-standalone \ --set router.modelServers.matchLabels.app=my-vllm-service ``` -#### Standalone with Agentgateway Proxy (Service-Backed) -Deploys EPP with an Agentgateway proxy. This mode requires disabling the `InferencePool` resource creation (`create=false`) and routes traffic to an existing Kubernetes Service: +#### Standalone with Agentgateway Proxy +Deploys EPP with an Agentgateway proxy. This mode requires disabling the `InferencePool` resource creation (`create=false`) and routes traffic directly to model servers: ```bash helm install my-standalone-router ./config/charts/llm-d-router-standalone \ --set router.inferencePool.create=false \ --set router.proxy.proxyType=agentgateway \ - --set router.proxy.agentgateway.service.name=my-model-service \ - --set router.proxy.agentgateway.service.ports="8000" + --set router.modelServers.matchLabels.app=my-model-service ``` #### Standalone with a Separate Proxy Service @@ -538,20 +537,21 @@ Configures EPP to run with a proxy (Envoy proxy or Agentgateway proxy) that inte | `router.proxy.volumeMounts` | Sidecar container volume mounts. 
| `[]` | | `router.proxy.volumes` | Sidecar container volumes. | `[]` | | `router.proxy.configMapData` | Key-value pairs to include in a ConfigMap created for the sidecar. | `{}` | -| `router.proxy.agentgateway.service.create` | **Agentgateway only**. Create a dedicated model Service for the Agentgateway proxy. | `true` | -| `router.proxy.agentgateway.service.name` | **Agentgateway only**. Name of the model Service to route to. | `""` | -| `router.proxy.agentgateway.service.namespace` | **Agentgateway only**. Namespace of the model Service. Defaults to release namespace. | `""` | -| `router.proxy.agentgateway.service.ports` | **Agentgateway only**. Port list for the model Service (must match `modelServers.targetPorts`). | `[]` | +#### Complete Standalone Example with Agentgateway Proxy -#### Complete Proxy Sidecar Example (Agentgateway Service-Backed) - -To deploy EPP in standalone mode with an Agentgateway sidecar routing traffic directly to an existing model Service `my-model-service` (bypassing `InferencePool` creation): +To deploy EPP in standalone mode with an Agentgateway sidecar routing traffic directly to model servers matching the label `app=my-model-service` (bypassing `InferencePool` creation): ```yaml router: inferencePool: create: false # Disable InferencePool creation + modelServers: + matchLabels: + app: "my-model-service" + targetPorts: + - number: 8000 + proxy: enabled: true proxyType: agentgateway @@ -561,10 +561,4 @@ router: memory: 4Gi limits: memory: 8Gi - agentgateway: - service: - create: true # Create a Service to route client traffic to EPP - name: "my-model-service" - ports: - - 8000 # Intercept traffic on port 8000 ``` diff --git a/config/charts/llm-d-router-standalone/templates/_validations.tpl b/config/charts/llm-d-router-standalone/templates/_validations.tpl index 21a11ac26f..633df10fc1 100644 --- a/config/charts/llm-d-router-standalone/templates/_validations.tpl +++ b/config/charts/llm-d-router-standalone/templates/_validations.tpl @@ 
-18,6 +18,13 @@ standalone validations {{- if not (or (eq $proxyMode "sidecar") (eq $proxyMode "service")) -}} {{- fail (printf ".Values.router.proxy.mode must be one of [sidecar, service], got %q" $proxyMode) -}} {{- end -}} +{{- /* Without an InferencePool the EPP --endpoint-selector is rendered from modelServers.matchLabels; an empty selector is rejected by EPP at startup, so require it here. */ -}} +{{- $useInferencePool := ne .Values.router.inferencePool.create false -}} +{{- if not $useInferencePool -}} + {{- if or (empty .Values.router.modelServers) (not .Values.router.modelServers.matchLabels) -}} + {{- fail ".Values.router.modelServers.matchLabels is required when .Values.router.inferencePool.create=false: standalone mode renders the EPP --endpoint-selector from matchLabels and cannot start with an empty selector" -}} + {{- end -}} +{{- end -}} {{- $failOpen := index $proxy "failOpen" -}} {{- if and (not (kindIs "invalid" $failOpen)) (not (kindIs "bool" $failOpen)) -}} {{- fail (printf ".Values.router.proxy.failOpen must be a boolean, got %q" (toString $failOpen)) -}} @@ -46,32 +53,17 @@ standalone validations {{- fail (printf ".Values.router.proxy.proxyType must be one of [envoy, agentgateway], got %q" $proxyType) -}} {{- end -}} {{- if eq $proxyType "agentgateway" -}} + {{- if hasKey $proxy "agentgateway" -}} + {{- fail ".Values.router.proxy.agentgateway is no longer supported; standalone agentgateway uses EPP endpoint discovery with a logical service backend" -}} + {{- end -}} {{- if ne .Values.router.inferencePool.create false -}} {{- fail ".Values.router.inferencePool.create=false is required when proxyType=agentgateway; standalone agentgateway currently supports only service-backed routing" -}} {{- end -}} - {{- $agentgateway := index $proxy "agentgateway" | default dict -}} - {{- $service := index $agentgateway "service" | default dict -}} - {{- $serviceName := index $service "name" | default "" -}} - {{- $serviceCreate := index $service "create" | 
default true -}} - {{- if hasKey $service "port" -}} - {{- fail ".Values.router.proxy.agentgateway.service.port has been replaced by .Values.router.proxy.agentgateway.service.ports" -}} - {{- end -}} - {{- if empty $serviceName -}} - {{- fail ".Values.router.proxy.agentgateway.service.name is required when proxyType=agentgateway" -}} - {{- end -}} - {{- $targetPorts := include "llm-d-router.standaloneEndpointTargetPorts" . -}} - {{- $servicePorts := include "llm-d-router.agentgateway.modelServicePorts" . -}} - {{- if ne $targetPorts $servicePorts -}} - {{- fail (printf ".Values.router.proxy.agentgateway.service.ports must match .Values.router.modelServers.targetPorts when proxyType=agentgateway, got service ports %q and target ports %q" $servicePorts $targetPorts) -}} - {{- end -}} {{- $listenerPort := include "llm-d-router.standaloneProxyListenerPort" . -}} {{- $flags := .Values.router.epp.flags | default dict -}} {{- if and (hasKey $flags "secure-serving") (ne (toString (index $flags "secure-serving")) "false") -}} {{- fail ".Values.router.epp.flags.secure-serving must be false when proxyType=agentgateway; standalone agentgateway uses plaintext gRPC to EPP over localhost" -}} {{- end -}} - {{- if $serviceCreate -}} - {{- $selectorLabels := include "llm-d-router.agentgateway.modelServiceSelectorLabels" . 
-}} - {{- end -}} {{- end -}} {{- end -}} {{- end -}} diff --git a/config/charts/llm-d-router-standalone/templates/agentgateway-service.yaml b/config/charts/llm-d-router-standalone/templates/agentgateway-service.yaml deleted file mode 100644 index 8f55b87017..0000000000 --- a/config/charts/llm-d-router-standalone/templates/agentgateway-service.yaml +++ /dev/null @@ -1,32 +0,0 @@ -{{- $proxy := .Values.router.proxy | default dict -}} -{{- $proxyType := default "envoy" ($proxy.proxyType | default "envoy") | lower -}} -{{- $agentgateway := index $proxy "agentgateway" | default dict -}} -{{- $service := index $agentgateway "service" | default dict -}} -{{- $serviceCreate := index $service "create" | default true -}} -{{- $serviceName := index $service "name" | default "" -}} -{{- if and $proxy.enabled (eq $proxyType "agentgateway") $serviceCreate (not (empty $serviceName)) -}} -{{- $serviceNamespace := index $service "namespace" | default .Release.Namespace -}} -{{- $servicePorts := splitList "," (include "llm-d-router.agentgateway.modelServicePorts" .) -}} -apiVersion: v1 -kind: Service -metadata: - name: {{ $serviceName | quote }} - namespace: {{ $serviceNamespace | quote }} - labels: - app.kubernetes.io/component: agentgateway-model-service - app.kubernetes.io/part-of: {{ include "llm-d-router.name" . | quote }} - {{- if .Chart.AppVersion }} - app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} - {{- end }} -spec: - selector: - {{- include "llm-d-router.agentgateway.modelServiceSelectorLabels" . 
| trim | nindent 4 }} - ports: - {{- range $servicePort := $servicePorts }} - - name: http-{{ $servicePort }} - port: {{ $servicePort }} - protocol: TCP - targetPort: {{ $servicePort }} - {{- end }} - type: ClusterIP -{{- end -}} diff --git a/config/charts/llm-d-router-standalone/values.yaml b/config/charts/llm-d-router-standalone/values.yaml index f4461fdb05..14c572ff65 100644 --- a/config/charts/llm-d-router-standalone/values.yaml +++ b/config/charts/llm-d-router-standalone/values.yaml @@ -33,16 +33,7 @@ router: limits: memory: 16Gi - # Agentgateway-specific settings used by the built-in preset when - # proxyType=agentgateway. service.name is required. - agentgateway: - service: - create: true - name: "" - namespace: "" - # Must match inferencePool.targetPorts. - ports: - - 8000 + # Built-in standalone proxy presets. The selected preset is merged with the # top-level proxy.* fields below, so explicit user overrides still win. diff --git a/config/charts/routerlib/templates/_helpers.tpl b/config/charts/routerlib/templates/_helpers.tpl index a27b9059dc..86870f48e7 100644 --- a/config/charts/routerlib/templates/_helpers.tpl +++ b/config/charts/routerlib/templates/_helpers.tpl @@ -269,13 +269,20 @@ Return the standalone EPP model-server target ports. {{- end -}} {{/* -Return the agentgateway model Service ports. +Return the agentgateway standalone logical backend service name. +Derives the name from .Values.router.modelServers.matchLabels.app, +falling back to .Release.Name if not set. 
*/}} -{{- define "llm-d-router.agentgateway.modelServicePorts" -}} -{{- $proxyValues := .Values.router.proxy | default dict -}} -{{- $agentgateway := index $proxyValues "agentgateway" | default dict -}} -{{- $service := index $agentgateway "service" | default dict -}} -{{- include "llm-d-router.normalizedPortList" (dict "path" ".Values.router.proxy.agentgateway.service.ports" "value" (index $service "ports")) -}} +{{- define "llm-d-router.agentgateway.logicalBackendName" -}} +{{- $appLabel := "" -}} +{{- if and .Values.router.modelServers .Values.router.modelServers.matchLabels -}} + {{- $appLabel = index .Values.router.modelServers.matchLabels "app" | default "" -}} +{{- end -}} +{{- if not (empty $appLabel) -}} + {{- $appLabel -}} +{{- else -}} + {{- .Release.Name -}} +{{- end -}} {{- end -}} {{/* @@ -329,30 +336,15 @@ Return the rendered proxy ConfigMap data. {{- toYaml $data -}} {{- end -}} -{{/* -Render labels from the standalone endpoint selector for the generated model Service. -Only equality-based selectors are supported because Service selectors are a map. -*/}} -{{- define "llm-d-router.agentgateway.modelServiceSelectorLabels" -}} -{{- if and .Values.router.modelServers .Values.router.modelServers.matchLabels -}} -{{- range $key, $value := .Values.router.modelServers.matchLabels -}} -{{- printf "%s: %s\n" ($key | quote) ($value | quote) -}} -{{- end -}} -{{- else -}} - {{- fail ".Values.modelServers.matchLabels is required when creating an agentgateway model Service" -}} -{{- end -}} -{{- end -}} + {{/* Render the default standalone agentgateway proxy config template. 
*/}} {{- define "llm-d-router.proxy.agentgatewayConfig" -}} -{{- $proxyValues := .Values.router.proxy | default dict -}} -{{- $agentgateway := index $proxyValues "agentgateway" | default dict -}} -{{- $service := index $agentgateway "service" | default dict -}} -{{- $serviceName := index $service "name" | default "" -}} -{{- $serviceNamespace := index $service "namespace" | default .Release.Namespace -}} -{{- $servicePorts := splitList "," (include "llm-d-router.agentgateway.modelServicePorts" .) -}} +{{- $serviceName := include "llm-d-router.agentgateway.logicalBackendName" . -}} +{{- $serviceNamespace := .Release.Namespace -}} +{{- $servicePorts := splitList "," (include "llm-d-router.standaloneEndpointTargetPorts" .) -}} {{- $backendPort := index $servicePorts 0 -}} {{- $listenerPort := include "llm-d-router.standaloneProxyListenerPort" . | int -}} config: diff --git a/docs/architecture.md b/docs/architecture.md index 17044ebbde..66f2fb9269 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -306,3 +306,4 @@ Enable chunked decode via the pd-sidecar flag: - [GIE Spec](../README.md#relation-to-gie-igw) - [Envoy External Processing](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) +- [EPP Container Sizing Guide](./operations.md) diff --git a/docs/operations.md b/docs/operations.md new file mode 100644 index 0000000000..f6df65b080 --- /dev/null +++ b/docs/operations.md @@ -0,0 +1,136 @@ +# llm-d Router Container Sizing Guide + +This guide provides resource sizing recommendations for both the Endpoint Picker (EPP) and the Envoy Proxy containers in the llm-d Router. Sizing recommendations are based on empirical benchmark results under various agentic and high-throughput workloads. + +--- + +## 1. Endpoint Picker (EPP) Sizing + +The EPP acts as the routing intelligence engine. 
Its resource usage scales primarily with the total request rate (throughput), the complexity of prefix cache matching configuration, and the number of model-serving pods. + +### Sizing Recommendations + +#### CPU Allocation +- **Rule of Thumb**: Allocate **0.5 to 1.0 CPU cores per request/second** of expected throughput for large agentic workloads (approximately 100k input / 1k output tokens). +- **Scaling Behavior**: CPU utilization scales linearly with the request rate, and increases with both the input prompt size and output token length. +- **Prefix Matching Overhead**: Increasing the `maxPrefixBlocksToMatch` parameter increases EPP CPU utilization. At lower throughputs, a large prefix block limit (such as 6250 blocks) can increase EPP CPU utilization by over 100% compared to a small limit (256 blocks) due to the overhead of searching and matching blocks. +- **Idle CPU Scaling**: Idle CPU usage of the EPP container scales with the number of model-serving pods in the cluster due to continuous metric scraping. For example, in a cluster with 100 model-serving pods, the idle CPU usage of the EPP container grows to approximately **7.5 cores**. + +#### Memory Allocation +- **Base Memory**: EPP memory usage is relatively low and stable with small output token requests, but scales with the number of concurrent inflight requests. +- **Inflight Requests Impact**: Memory usage increases with the number of concurrent inflight requests and the output (decode) token length. +- **Sizing Guidelines**: + - For a request rate of 50 to 100 requests/second with 1k output tokens, EPP requires between **4 GiB and 6 GiB** of memory. + - For workloads with longer output lengths (such as 5k output tokens), memory usage can reach **20+ GiB** due to the accumulation of state for concurrent inflight requests. + +#### Scaling Modes (Active-Active vs. 
Active-Passive) +The EPP's scaling behavior and effectiveness are highly dependent on the configured high availability (HA) mode: + +- **Active-Passive Mode**: Only one EPP replica actively serves Envoy external processing (`ext-proc`) requests at a time, while the others remain in standby. + - **Sizing Impact**: Scaling the replica count does **not** increase the overall EPP throughput capacity or impact resource sizing, as only the active replica handles requests. +- **Active-Active Mode**: Multiple EPP replicas actively share and load-balance incoming requests, providing **near-linear throughput scaling**: + + | Replicas | Scaling Factor | + | :--- | :--- | + | 1 | 1.0x | + | 2 | 2.0x | + | 3 | 2.7x | + | 4 | 3.5x | + + - **Warning (Prefix Routing)**: **Active-Active mode should be avoided when using approximate prefix routing.** Because EPP replicas do not share prefix state, each replica only has visibility into the prefix state of the requests it has individually handled. This partition of state significantly degrades prefix cache hit rates, making prefix caching highly inefficient. + - For more technical details and context on EPP replica state sync and scaling limitations, see [Issue #1290](https://github.com/llm-d/llm-d-router/issues/1290). + +### Performance Reference Data + +The following tables present empirical benchmark results for EPP running with llm-d-simulator simulating Qwen/Qwen3-8B. + +#### Throughput and Prefix Block Sizing +This table shows peak CPU and memory utilization for EPP under a 100k token workload (95k system prompt, 5k question prompt, and 1k output tokens) when using approximate prefix caching across 100 model-serving pods. 
+ +| Configuration | Request Rate (Req/s) | maxPrefixBlocksToMatch | Peak CPU (Cores) | Peak Memory (GiB) | Scheduler P50 Latency (s) | +| :--- | :--- | :--- | :--- | :--- | :--- | +| Small Prefix Match | 5.0 | 256 | 1.19 | 0.26 | 0.00010 | +| Large Prefix Match | 5.0 | 6250 | 3.82 | 0.65 | 0.00010 | +| Small Prefix Match | 98.7 | 256 | 35.17 | 2.46 | 0.00014 | +| Large Prefix Match | 98.8 | 6250 | 46.50 | 3.41 | 0.00020 | + +Configuration used: [#1287](https://github.com/llm-d/llm-d-router/issues/1287#issuecomment-4666058475). +These were run against the 0.9.0 EPP container image. + +#### Output Length and Prefix Matching Complexity +This table shows EPP peak resource usage at a constant request rate of 50 requests/second with a 100k input token workload, varying the output token length and the `maxPrefixBlocksToMatch` configuration. + +| Input Tokens | Output Tokens | maxPrefixBlocksToMatch | Peak CPU (Cores) | Peak Memory (GiB) | +| :--- | :--- | :--- | :--- | :--- | +| 100k | 500 | 256 | 15.13 | 2.27 | +| 100k | 500 | 2048 | 17.14 | 3.76 | +| 100k | 1000 | 256 | 17.51 | 3.66 | +| 100k | 1000 | 2048 | 20.28 | 5.23 | +| 100k | 5000 | 1024 | 30.95 | 12.54 | +| 100k | 10000 | 512 | 32.53 | 12.54 | + +Configuration used: [#1287](https://github.com/llm-d/llm-d-router/issues/1287#issuecomment-4619775397). +These were run against the 0.9.0 EPP container image. + +--- + +## 2. Envoy Proxy Sizing (Standalone Mode) + +When running the llm-d Router in **Standalone Mode**, the Envoy proxy container runs in the same pod alongside the EPP container. Sizing the Envoy proxy container depends primarily on the request throughput (requests/second) and the request/response payload size (concurrency of streaming data). + +### Sizing Recommendations + +#### CPU Allocation +- **Scaling Behavior**: Envoy's CPU usage scales linearly with the total throughput (requests/second). +- **Sizing Guidelines**: + - For lower throughput (e.g., < 10 requests/second), **1.2 to 2.0 CPU cores** are sufficient. 
+ - For higher throughput of large contexts (e.g., 100 requests/second with 100k/1k tokens), allocate at least **8 CPU cores** (peak usage observed at **7.27 cores**). + - For very high throughput of smaller contexts (e.g., 892 requests/second with 10k/1k tokens), allocate at least **10 CPU cores** (peak usage observed at **8.78 cores**). + +#### Memory Allocation +- **Sizing Guidelines**: Envoy's memory footprint remains extremely stable and is primarily influenced by the number of concurrent active connections and buffer sizes. Allocate at least **2 GiB of memory** (peak memory usage is stable between **1.3 and 1.4 GiB** across all tested throughputs and context lengths). + +### Performance Reference Data + +The following table presents empirical benchmark results for the Envoy proxy container in Standalone Mode under different workloads: + +| Input Tokens | Output Tokens | Throughput (Req/s) | Peak CPU (Cores) | Peak Memory (GiB) | +| :--- | :--- | :--- | :--- | :--- | +| 100k | 1k | 10.0 | 1.20 | 1.30 | +| 100k | 1k | 100.0 | 7.27 | < 1.40 | +| 10k | 1k | 892.0 | 8.78 | 1.40 | + +--- + +## 3. Helm Configuration Example + +For deployments managed via Helm (such as using the `llm-d-router-standalone` chart), both the EPP and the Envoy proxy container resource requests and limits can be configured in a custom values file, such as `resource_overrides.yaml`. 
+ +Below is an example `resource_overrides.yaml` snippet configured to support a throughput of up to 50 requests/second for 100k/1k agentic requests in Standalone Mode: + +```yaml +router: + # Endpoint Picker (EPP) Container Resources + epp: + resources: + requests: + cpu: "32" + memory: "64Gi" + limits: + memory: "128Gi" + + # Envoy Proxy Container Resources + proxy: + resources: + requests: + cpu: "8" + memory: "2Gi" + limits: + memory: "4Gi" +``` + +To apply these values during deployment, run the Helm install or upgrade command with your custom values file: + +```bash +helm install optimize-baseline ./config/charts/llm-d-router-standalone -f resource_overrides.yaml +``` diff --git a/go.mod b/go.mod index 8099892bb0..63bbac45e5 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/jellydator/ttlcache/v3 v3.4.0 - github.com/llm-d/llm-d-kv-cache v0.9.0-rc.1 + github.com/llm-d/llm-d-kv-cache v0.9.0 github.com/onsi/ginkgo/v2 v2.28.3 github.com/onsi/gomega v1.40.0 github.com/openai/openai-go v1.12.0 diff --git a/go.sum b/go.sum index 0fa9ca50d2..83ccb3af74 100644 --- a/go.sum +++ b/go.sum @@ -153,8 +153,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/llm-d/llm-d-kv-cache v0.9.0-rc.1 h1:wg/lxAkdbIqVCuhY9gYoJJTXZtGGYTj73B5VAclbMFE= -github.com/llm-d/llm-d-kv-cache v0.9.0-rc.1/go.mod h1:dxtEGCGrKV7PwRhpSAOfJi3rf8NOhUeL4RNXW945HNU= +github.com/llm-d/llm-d-kv-cache v0.9.0 h1:6hLLJGrP91A28LYH2Gd5i81e5sreSSHizdlSnB9NXqc= +github.com/llm-d/llm-d-kv-cache v0.9.0/go.mod h1:dxtEGCGrKV7PwRhpSAOfJi3rf8NOhUeL4RNXW945HNU= github.com/maruel/natural v1.1.1 
h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= diff --git a/hack/push-chart.sh b/hack/push-chart.sh index 779dccf013..bd0e5dfe54 100755 --- a/hack/push-chart.sh +++ b/hack/push-chart.sh @@ -50,14 +50,14 @@ then exit 1 fi ${YQ} -i \ - '.latencyPredictor.trainingServer.image.registry=strenv(IMAGE_REGISTRY) | - .latencyPredictor.trainingServer.image.repository="llm-d-latency-predictor-training-server" | - .latencyPredictor.trainingServer.image.tag=strenv(LATENCY_PREDICTOR_TAG) | - .latencyPredictor.trainingServer.image.pullPolicy="IfNotPresent" | - .latencyPredictor.predictionServers.image.registry=strenv(IMAGE_REGISTRY) | - .latencyPredictor.predictionServers.image.repository="llm-d-latency-predictor-prediction-server" | - .latencyPredictor.predictionServers.image.tag=strenv(LATENCY_PREDICTOR_TAG) | - .latencyPredictor.predictionServers.image.pullPolicy="IfNotPresent"' \ + '.router.latencyPredictor.trainingServer.image.registry=strenv(IMAGE_REGISTRY) | + .router.latencyPredictor.trainingServer.image.repository="llm-d-latency-predictor-training-server" | + .router.latencyPredictor.trainingServer.image.tag=strenv(LATENCY_PREDICTOR_TAG) | + .router.latencyPredictor.trainingServer.image.pullPolicy="IfNotPresent" | + .router.latencyPredictor.predictionServers.image.registry=strenv(IMAGE_REGISTRY) | + .router.latencyPredictor.predictionServers.image.repository="llm-d-latency-predictor-prediction-server" | + .router.latencyPredictor.predictionServers.image.tag=strenv(LATENCY_PREDICTOR_TAG) | + .router.latencyPredictor.predictionServers.image.pullPolicy="IfNotPresent"' \ config/charts/${CHART}/values.yaml if [[ ${CHART} == "llm-d-router-standalone" ]]; then ${YQ} -i \ diff --git a/hack/verify-helm.sh b/hack/verify-helm.sh index e099510560..106dd2edca 100755 --- a/hack/verify-helm.sh +++ 
b/hack/verify-helm.sh @@ -114,7 +114,7 @@ test_cases_llm_d_router_standalone["basic"]="--set router.modelServers.matchLabe test_cases_llm_d_router_standalone["gke-provider"]="--set provider.name=gke --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false" test_cases_llm_d_router_standalone["latency-predictor"]="--set router.latencyPredictor.enabled=true --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false" test_cases_llm_d_router_standalone["llm-d-router-gateway"]="--set router.inferencePool.create=true --set router.modelServers.matchLabels.app=llm-instance-gateway" -test_cases_llm_d_router_standalone["agentgateway"]="--set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=llm-instance-gateway --set 'router.proxy.agentgateway.service.ports[0]=8000' --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000'" +test_cases_llm_d_router_standalone["agentgateway"]="--set router.proxy.proxyType=agentgateway --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000'" test_cases_llm_d_router_standalone["proxy-service"]="--set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set router.proxy.mode=service --set router.proxy.replicas=3" test_cases_llm_d_router_standalone["triton"]="--set router.modelServers.type=triton --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false" @@ -148,6 +148,13 @@ for key in "${!test_cases_llm_d_router_standalone[@]}"; do done echo "Running llm-d-router-standalone negative validation tests..." 
+missing_endpoint_selector_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.inferencePool.create=false --set router.modelServers.type=vllm --set 'router.modelServers.targetPorts[0].number=8000' >/dev/null" +echo "Executing: ${missing_endpoint_selector_command}" +if eval "${missing_endpoint_selector_command}"; then + echo "Helm template unexpectedly succeeded for inferencePool.create=false without modelServers.matchLabels" + exit 1 +fi + invalid_proxy_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set router.proxy.proxyType=bogus >/dev/null" echo "Executing: ${invalid_proxy_command}" if eval "${invalid_proxy_command}"; then @@ -155,35 +162,28 @@ if eval "${invalid_proxy_command}"; then exit 1 fi -missing_agentgateway_service_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set router.proxy.proxyType=agentgateway >/dev/null" -echo "Executing: ${missing_agentgateway_service_command}" -if eval "${missing_agentgateway_service_command}"; then - echo "Helm template unexpectedly succeeded for missing agentgateway service.name" +deprecated_agentgateway_service_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=foo >/dev/null" +echo "Executing: ${deprecated_agentgateway_service_command}" +if eval "${deprecated_agentgateway_service_command}"; then + echo "Helm template unexpectedly succeeded for deprecated agentgateway.service configuration" exit 1 fi -unsupported_agentgateway_llm_d_router_gateway_command="${HELM} template 
${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=llm-instance-gateway --set 'router.proxy.agentgateway.service.ports[0]=8000' --set router.inferencePool.create=true --set router.modelServers.matchLabels.app=llm-instance-gateway >/dev/null" +unsupported_agentgateway_llm_d_router_gateway_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.inferencePool.create=true --set router.modelServers.matchLabels.app=llm-instance-gateway >/dev/null" echo "Executing: ${unsupported_agentgateway_llm_d_router_gateway_command}" if eval "${unsupported_agentgateway_llm_d_router_gateway_command}"; then echo "Helm template unexpectedly succeeded for unsupported agentgateway createInferencePool=true configuration" exit 1 fi -mismatched_agentgateway_ports_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=llm-instance-gateway --set 'router.proxy.agentgateway.service.ports[0]=8001' --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' >/dev/null" -echo "Executing: ${mismatched_agentgateway_ports_command}" -if eval "${mismatched_agentgateway_ports_command}"; then - echo "Helm template unexpectedly succeeded for mismatched agentgateway service.ports" - exit 1 -fi - -unsupported_agentgateway_listener_port_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=llm-instance-gateway --set 'router.proxy.agentgateway.service.ports[0]=8000' --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' --set 
'router.extraServicePorts[0].name=proxy' --set 'router.extraServicePorts[0].port=9000' --set 'router.extraServicePorts[0].protocol=TCP' --set 'router.extraServicePorts[0].targetPort=9000' >/dev/null" +unsupported_agentgateway_listener_port_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' --set 'router.extraServicePorts[0].name=proxy' --set 'router.extraServicePorts[0].port=9000' --set 'router.extraServicePorts[0].protocol=TCP' --set 'router.extraServicePorts[0].targetPort=9000' >/dev/null" echo "Executing: ${unsupported_agentgateway_listener_port_command}" if eval "${unsupported_agentgateway_listener_port_command}"; then echo "Helm template unexpectedly succeeded without an agentgateway listener Service port named http" exit 1 fi -mismatched_agentgateway_listener_target_port_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=llm-instance-gateway --set 'router.proxy.agentgateway.service.ports[0]=8000' --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' --set 'router.extraServicePorts[0].name=http' --set 'router.extraServicePorts[0].port=9000' --set 'router.extraServicePorts[0].protocol=TCP' --set 'router.extraServicePorts[0].targetPort=9001' >/dev/null" +mismatched_agentgateway_listener_target_port_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' --set 'router.extraServicePorts[0].name=http' --set 
'router.extraServicePorts[0].port=9000' --set 'router.extraServicePorts[0].protocol=TCP' --set 'router.extraServicePorts[0].targetPort=9001' >/dev/null" echo "Executing: ${mismatched_agentgateway_listener_target_port_command}" if eval "${mismatched_agentgateway_listener_target_port_command}"; then echo "Helm template unexpectedly succeeded for an agentgateway listener targetPort that does not match port" @@ -216,7 +216,7 @@ fi echo "Verifying llm-d-router-standalone agentgateway renders plaintext EPP and custom listener ports..." agentgateway_render_output="${TEMP_DIR}/llm-d-router-standalone-agentgateway-render.yaml" -agentgateway_render_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.proxy.agentgateway.service.name=llm-instance-gateway --set 'router.proxy.agentgateway.service.ports[0]=8000' --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' --set 'router.extraServicePorts[0].name=http' --set 'router.extraServicePorts[0].port=9000' --set 'router.extraServicePorts[0].protocol=TCP' --set 'router.extraServicePorts[0].targetPort=http' > ${agentgateway_render_output}" +agentgateway_render_command="${HELM} template ${SCRIPT_ROOT}/config/charts/llm-d-router-standalone --set router.proxy.proxyType=agentgateway --set router.modelServers.matchLabels.app=llm-instance-gateway --set router.inferencePool.create=false --set 'router.modelServers.targetPorts[0].number=8000' --set 'router.extraServicePorts[0].name=http' --set 'router.extraServicePorts[0].port=9000' --set 'router.extraServicePorts[0].protocol=TCP' --set 'router.extraServicePorts[0].targetPort=http' > ${agentgateway_render_output}" echo "Executing: ${agentgateway_render_command}" eval "${agentgateway_render_command}" if ! grep -q -- '--secure-serving=false' "${agentgateway_render_output}"; then @@ -239,21 +239,16 @@ if ! 
grep -q -- 'destinationMode: passthrough' "${agentgateway_render_output}"; echo "Agentgateway Helm template did not render passthrough destination mode" exit 1 fi - -agentgateway_service_block="${TEMP_DIR}/llm-d-router-standalone-agentgateway-service.yaml" -sed -n '/^# Source: llm-d-router-standalone\/templates\/agentgateway-service.yaml/,/^---/p' "${agentgateway_render_output}" > "${agentgateway_service_block}" -if ! grep -q -- 'app.kubernetes.io/component: agentgateway-model-service' "${agentgateway_service_block}"; then - echo "Agentgateway model Service did not render its component label" +if ! grep -q -- 'name: "default/llm-instance-gateway"' "${agentgateway_render_output}"; then + echo "Agentgateway Helm template did not derive the logical backend name from modelServers" exit 1 fi -agentgateway_selector_block="${TEMP_DIR}/llm-d-router-standalone-agentgateway-service-selector.yaml" -sed -n '/^ selector:$/,/^ ports:$/p' "${agentgateway_service_block}" > "${agentgateway_selector_block}" -if ! grep -Eq -- '^[[:space:]]+"?app"?:[[:space:]]+"?llm-instance-gateway"?[[:space:]]*$' "${agentgateway_selector_block}"; then - echo "Agentgateway model Service did not render selector labels from router.modelServers.matchLabels" +if ! 
grep -q -- 'hostname: "llm-instance-gateway"' "${agentgateway_render_output}"; then + echo "Agentgateway Helm template did not derive the logical backend hostname from modelServers" exit 1 fi -if grep -q -- 'app.kubernetes.io/name:' "${agentgateway_service_block}"; then - echo "Agentgateway model Service rendered an app.kubernetes.io/name label" +if grep -q -- '# Source: llm-d-router-standalone/templates/agentgateway-service.yaml' "${agentgateway_render_output}"; then + echo "Agentgateway model Service unexpectedly rendered" exit 1 fi diff --git a/pkg/sidecar/proxy/allowlist.go b/pkg/sidecar/proxy/allowlist.go index 33572f4a21..6bf3323bb6 100644 --- a/pkg/sidecar/proxy/allowlist.go +++ b/pkg/sidecar/proxy/allowlist.go @@ -250,27 +250,34 @@ func (av *AllowlistValidator) onInferencePoolDelete(obj interface{}) { func (av *AllowlistValidator) updatePodsForPool(poolObj *unstructured.Unstructured) { poolName := poolObj.GetName() - // Parse the pool spec to get selector - spec, found, err := unstructured.NestedMap(poolObj.Object, "spec") - if err != nil || !found { - av.logger.Error(err, "InferencePool missing or invalid spec field", "name", poolName, "found", found) + selector, err := av.poolSelector(poolObj) + if err != nil { + av.logger.Error(err, "failed to extract selector from InferencePool", "name", poolName) return } - selectorData, found, err := unstructured.NestedMap(spec, "selector") + av.createPodInformer(poolName, selector) +} + +func (av *AllowlistValidator) poolSelector(poolObj *unstructured.Unstructured) (labels.Selector, error) { + spec, found, err := unstructured.NestedMap(poolObj.Object, "spec") if err != nil || !found { - av.logger.Error(err, "InferencePool missing or invalid selector field", "name", poolName, "found", found) - return + return nil, fmt.Errorf("missing or invalid spec field (found=%t): %w", found, err) } - // Convert to labels.Selector - labelSelector := labels.Set{} - for k, v := range selectorData { - labelSelector[k] = 
fmt.Sprintf("%v", v) + // GA API (inference.networking.k8s.io) uses spec.selector.matchLabels; + // deprecated alpha API (inference.networking.x-k8s.io) uses a flat spec.selector map. + selectorPath := []string{"selector", "matchLabels"} + if av.gvr.Group != routing.InferencePoolAPIGroup { + selectorPath = []string{"selector"} + } + + selectorData, found, err := unstructured.NestedStringMap(spec, selectorPath...) + if err != nil || !found { + return nil, fmt.Errorf("missing or invalid selector field at %v (found=%t): %w", selectorPath, found, err) } - // Create or update pod informer for this selector - av.createPodInformer(poolName, labelSelector.AsSelector()) + return labels.Set(selectorData).AsSelector(), nil } // createPodInformer creates a new pod informer for the given selector diff --git a/pkg/sidecar/proxy/allowlist_test.go b/pkg/sidecar/proxy/allowlist_test.go index b2d8eb567f..57f9cc1b1e 100644 --- a/pkg/sidecar/proxy/allowlist_test.go +++ b/pkg/sidecar/proxy/allowlist_test.go @@ -21,6 +21,8 @@ import ( . 
"github.com/onsi/gomega" // nolint:revive "github.com/llm-d/llm-d-router/pkg/common/routing" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/utils/set" ) @@ -41,6 +43,139 @@ var _ = Describe("AllowlistValidator", func() { }) }) + Context("poolSelector", func() { + It("should extract selector from GA InferencePool (matchLabels)", func() { + av := &AllowlistValidator{ + gvr: schema.GroupVersionResource{ + Group: routing.InferencePoolAPIGroup, + Version: "v1", + Resource: "inferencepools", + }, + } + pool := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "inference.networking.k8s.io/v1", + "kind": "InferencePool", + "metadata": map[string]interface{}{"name": "test-pool"}, + "spec": map[string]interface{}{ + "selector": map[string]interface{}{ + "matchLabels": map[string]interface{}{ + "app.kubernetes.io/name": "my-model", + "component": "serving", + }, + }, + }, + }, + } + + selector, err := av.poolSelector(pool) + Expect(err).ToNot(HaveOccurred()) + Expect(selector.String()).To(SatisfyAll( + ContainSubstring("app.kubernetes.io/name=my-model"), + ContainSubstring("component=serving"), + )) + }) + + It("should extract selector from deprecated alpha InferencePool (flat map)", func() { + av := &AllowlistValidator{ + gvr: schema.GroupVersionResource{ + Group: "inference.networking.x-k8s.io", + Version: "v1alpha2", + Resource: "inferencepools", + }, + } + pool := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "inference.networking.x-k8s.io/v1alpha2", + "kind": "InferencePool", + "metadata": map[string]interface{}{"name": "test-pool"}, + "spec": map[string]interface{}{ + "selector": map[string]interface{}{ + "app.kubernetes.io/name": "my-model", + "component": "serving", + }, + }, + }, + } + + selector, err := av.poolSelector(pool) + Expect(err).ToNot(HaveOccurred()) + Expect(selector.String()).To(SatisfyAll( + 
ContainSubstring("app.kubernetes.io/name=my-model"), + ContainSubstring("component=serving"), + )) + }) + + It("should fail for GA pool with flat selector (no matchLabels)", func() { + av := &AllowlistValidator{ + gvr: schema.GroupVersionResource{ + Group: routing.InferencePoolAPIGroup, + Version: "v1", + Resource: "inferencepools", + }, + } + pool := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "inference.networking.k8s.io/v1", + "kind": "InferencePool", + "metadata": map[string]interface{}{"name": "test-pool"}, + "spec": map[string]interface{}{ + "selector": map[string]interface{}{ + "app": "my-model", + }, + }, + }, + } + + _, err := av.poolSelector(pool) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("matchLabels")) + }) + + It("should fail when spec is missing", func() { + av := &AllowlistValidator{ + gvr: schema.GroupVersionResource{ + Group: routing.InferencePoolAPIGroup, + Version: "v1", + Resource: "inferencepools", + }, + } + pool := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "inference.networking.k8s.io/v1", + "kind": "InferencePool", + "metadata": map[string]interface{}{"name": "test-pool"}, + }, + } + + _, err := av.poolSelector(pool) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("spec")) + }) + + It("should fail when selector is missing", func() { + av := &AllowlistValidator{ + gvr: schema.GroupVersionResource{ + Group: "inference.networking.x-k8s.io", + Version: "v1alpha2", + Resource: "inferencepools", + }, + } + pool := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "inference.networking.x-k8s.io/v1alpha2", + "kind": "InferencePool", + "metadata": map[string]interface{}{"name": "test-pool"}, + "spec": map[string]interface{}{}, + }, + } + + _, err := av.poolSelector(pool) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("selector")) + }) + }) + Context("when SSRF 
protection is enabled", func() { var validator *AllowlistValidator diff --git a/pkg/sidecar/proxy/proxy.go b/pkg/sidecar/proxy/proxy.go index bdfc614526..dd00b84fc2 100644 --- a/pkg/sidecar/proxy/proxy.go +++ b/pkg/sidecar/proxy/proxy.go @@ -31,6 +31,7 @@ import ( "github.com/go-logr/logr" lru "github.com/hashicorp/golang-lru/v2" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "golang.org/x/sync/errgroup" "sigs.k8s.io/controller-runtime/pkg/log" @@ -400,10 +401,12 @@ func (s *Server) Clone() *Server { } } -// newProxyTransport returns an http.Transport cloned from the default with -// connection-pool settings applied. If scheme is schemeHTTPS the transport's -// TLSClientConfig is set accordingly. -func (s *Server) newProxyTransport(scheme string, insecureSkipVerify bool) *http.Transport { +// newProxyTransport returns an http.RoundTripper backed by an http.Transport +// cloned from the default with connection-pool settings applied. If scheme is +// schemeHTTPS the transport's TLSClientConfig is set accordingly. The transport +// is wrapped with otelhttp so outbound requests carry W3C trace context, +// keeping EPP, routing-proxy, and vLLM spans in a single trace. +func (s *Server) newProxyTransport(scheme string, insecureSkipVerify bool) http.RoundTripper { maxIdle := s.config.MaxIdleConnsPerHost if maxIdle <= 0 { maxIdle = defaultMaxIdleConnsPerHost @@ -427,7 +430,7 @@ func (s *Server) newProxyTransport(scheme string, insecureSkipVerify bool) *http }, } } - return t + return otelhttp.NewTransport(t) } func (s *Server) setKVConnector() { diff --git a/pkg/sidecar/proxy/transport_test.go b/pkg/sidecar/proxy/transport_test.go new file mode 100644 index 0000000000..d55d5b9588 --- /dev/null +++ b/pkg/sidecar/proxy/transport_test.go @@ -0,0 +1,78 @@ +/* +Copyright 2025 The llm-d Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package proxy + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" +) + +// newProxyTransport must inject W3C trace context into outbound requests so that +// EPP -> routing-proxy -> vLLM share a single trace. +func TestNewProxyTransportInjectsTraceContext(t *testing.T) { + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{})) + + var gotTraceparent string + backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotTraceparent = r.Header.Get("traceparent") + w.WriteHeader(http.StatusOK) + })) + defer backend.Close() + + s := NewProxy(Config{}) + client := &http.Client{Transport: s.newProxyTransport("http", false)} + + traceID, err := trace.TraceIDFromHex("0123456789abcdef0123456789abcdef") + if err != nil { + t.Fatalf("parse trace ID: %v", err) + } + spanID, err := trace.SpanIDFromHex("0123456789abcdef") + if err != nil { + t.Fatalf("parse span ID: %v", err) + } + sc := trace.NewSpanContext(trace.SpanContextConfig{ + TraceID: traceID, + SpanID: spanID, + TraceFlags: trace.FlagsSampled, + Remote: true, + }) + ctx := trace.ContextWithSpanContext(context.Background(), sc) + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, backend.URL, nil) + if err != nil { + t.Fatalf("build request: %v", err) + } + resp, err := client.Do(req) + if err != nil { + t.Fatalf("do request: %v", err) + } + _ = resp.Body.Close() + + if 
gotTraceparent == "" { + t.Fatal("expected traceparent header to be injected into outbound request, got none") + } + if !strings.Contains(gotTraceparent, traceID.String()) { + t.Fatalf("expected outbound traceparent to carry trace ID %s, got %q", traceID, gotTraceparent) + } +} diff --git a/scripts/check-latest-tags.sh b/scripts/check-latest-tags.sh index a71cbb8bdf..dcaf173a65 100755 --- a/scripts/check-latest-tags.sh +++ b/scripts/check-latest-tags.sh @@ -35,7 +35,7 @@ set -o pipefail REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" # Images under these llm-d-owned registries may use ':latest'. -OWNED_IMAGE_RE='image:[[:space:]]*"?(ghcr\.io|quay\.io)/llm-d/' +OWNED_IMAGE_RE='image:[[:space:]]*['\''"]?(ghcr\.io|quay\.io)/llm-d/' WARN_ONLY=false diff --git a/test/scripts/e2e-common.sh b/test/scripts/e2e-common.sh new file mode 100644 index 0000000000..91ab2d7854 --- /dev/null +++ b/test/scripts/e2e-common.sh @@ -0,0 +1,42 @@ +# Copyright 2026 The llm-d Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Shared helpers for the e2e runner scripts. Source, do not execute. + +# e2e_handle_interrupt deletes the named kind cluster on Ctrl-C and exits 130. +# An empty cluster name means there is nothing to delete (the caller did not +# create a cluster it owns). E2E_KEEP_CLUSTER_ON_FAILURE=true keeps the cluster. +e2e_handle_interrupt() { + local cluster="$1" + echo "Interrupted!" 
+ if [ -n "${cluster}" ] && [ "${E2E_KEEP_CLUSTER_ON_FAILURE:-false}" != "true" ]; then + echo "Deleting kind cluster '${cluster}'" + kind delete cluster --name "${cluster}" 2>/dev/null || true + elif [ -n "${cluster}" ]; then + echo "Keeping kind cluster '${cluster}' (E2E_KEEP_CLUSTER_ON_FAILURE=true)" + fi + exit 130 # SIGINT (Ctrl+C) +} + +# run_ginkgo_suite runs the Ginkgo e2e suite in the given package directory, +# applying E2E_LABEL_FILTER when set. +run_ginkgo_suite() { + local pkg="$1" + if [ -n "${E2E_LABEL_FILTER:-}" ]; then + echo "Label filter: ${E2E_LABEL_FILTER}" + go test -v -timeout 45m "${pkg}" -ginkgo.v -ginkgo.fail-fast "-ginkgo.label-filter=${E2E_LABEL_FILTER}" + else + go test -v -timeout 45m "${pkg}" -ginkgo.v -ginkgo.fail-fast + fi +} diff --git a/test/scripts/run_e2e.sh b/test/scripts/run_e2e.sh deleted file mode 100755 index 8f4b1d9893..0000000000 --- a/test/scripts/run_e2e.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -cleanup() { - echo "Interrupted!" 
- if [ "${E2E_KEEP_CLUSTER_ON_FAILURE:-false}" = "true" ]; then - echo "Keeping kind cluster 'e2e-tests' (E2E_KEEP_CLUSTER_ON_FAILURE=true)" - else - echo "Deleting kind cluster 'e2e-tests'" - kind delete cluster --name e2e-tests 2>/dev/null || true - fi - exit 130 # SIGINT (Ctrl+C) -} - -# Set trap only for interruption signals -# Normally kind cluster cleanup is done by AfterSuite -trap cleanup INT TERM - -echo "Running end to end tests" - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ -n "${E2E_LABEL_FILTER:-}" ]; then - echo "Label filter: ${E2E_LABEL_FILTER}" - go test -v -timeout 45m "${DIR}/../e2e/" -ginkgo.v -ginkgo.fail-fast "-ginkgo.label-filter=${E2E_LABEL_FILTER}" -else - go test -v -timeout 45m "${DIR}/../e2e/" -ginkgo.v -ginkgo.fail-fast -fi diff --git a/hack/test-e2e.sh b/test/scripts/test-e2e-gaie.sh similarity index 80% rename from hack/test-e2e.sh rename to test/scripts/test-e2e-gaie.sh index 0c3de5bc85..d9720b4114 100755 --- a/hack/test-e2e.sh +++ b/test/scripts/test-e2e-gaie.sh @@ -18,9 +18,12 @@ set -euox pipefail DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# shellcheck source=test/scripts/e2e-common.sh +source "${DIR}/e2e-common.sh" + EPP_IMAGE="${EPP_IMAGE:-ghcr.io/llm-d/llm-d-router-endpoint-picker:dev}" SIM_IMAGE="${VLLM_IMAGE:-ghcr.io/llm-d/llm-d-inference-sim:v0.9.2}" -MANIFEST_PATH="${MANIFEST_PATH:-${DIR}/../test/testdata/sim-deployment.yaml}" +MANIFEST_PATH="${MANIFEST_PATH:-${DIR}/../testdata/sim-deployment.yaml}" USE_KIND="${USE_KIND:-true}" KIND_NODE_IMAGE="${KIND_NODE_IMAGE:-mirror.gcr.io/kindest/node:v1.32.2}" @@ -48,18 +51,11 @@ load_images() { CLUSTER_NAME="${cluster}" ./scripts/load_image.sh "${EPP_IMAGE}" "${SIM_IMAGE}" } -cleanup() { - echo "Interrupted!" 
- if [ -n "${CREATED_CLUSTER}" ] && [ "${E2E_KEEP_CLUSTER_ON_FAILURE:-false}" != "true" ]; then - echo "Deleting kind cluster '${CREATED_CLUSTER}'" - kind delete cluster --name "${CREATED_CLUSTER}" 2>/dev/null || true - fi - exit 130 # SIGINT (Ctrl+C) -} - # Normally kind cluster cleanup is done by AfterSuite; this trap only fires on # interruption signals so that a Ctrl+C still cleans up the cluster we created. -trap cleanup INT TERM +# CREATED_CLUSTER is empty until we create a cluster ourselves, so an interrupt +# before then deletes nothing. +trap 'e2e_handle_interrupt "${CREATED_CLUSTER}"' INT TERM if [ "${USE_KIND}" = "true" ]; then install_kind @@ -92,11 +88,5 @@ else fi echo "Running Go e2e tests in ./test/e2e/epp/..." -if [ -n "${E2E_LABEL_FILTER:-}" ]; then - echo "Label filter: ${E2E_LABEL_FILTER}" - MANIFEST_PATH="${MANIFEST_PATH}" E2E_IMAGE="${EPP_IMAGE}" \ - go test "${DIR}/../test/e2e/epp/" -v -timeout 45m -ginkgo.v -ginkgo.fail-fast "-ginkgo.label-filter=${E2E_LABEL_FILTER}" -else - MANIFEST_PATH="${MANIFEST_PATH}" E2E_IMAGE="${EPP_IMAGE}" \ - go test "${DIR}/../test/e2e/epp/" -v -timeout 45m -ginkgo.v -ginkgo.fail-fast -fi +export MANIFEST_PATH E2E_IMAGE="${EPP_IMAGE}" +run_ginkgo_suite "${DIR}/../e2e/epp/" diff --git a/test/scripts/test-e2e-router.sh b/test/scripts/test-e2e-router.sh new file mode 100755 index 0000000000..491cbdda3e --- /dev/null +++ b/test/scripts/test-e2e-router.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright 2026 The llm-d Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# shellcheck source=test/scripts/e2e-common.sh +source "${DIR}/e2e-common.sh" + +# Set trap only for interruption signals. +# Normally kind cluster cleanup is done by AfterSuite; this trap deletes the +# e2e-tests cluster on Ctrl-C unless E2E_KEEP_CLUSTER_ON_FAILURE=true. The +# delete does not check who created the cluster, so it is only meaningful when +# the suite created it (K8S_CONTEXT unset); against an existing context it is a +# no-op unless a cluster named e2e-tests happens to exist. +trap 'e2e_handle_interrupt "e2e-tests"' INT TERM + +echo "Running end to end tests" + +run_ginkgo_suite "${DIR}/../e2e/"