diff --git a/README.md b/README.md index 0e4b2eb..baee048 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,6 @@ namespaces and are shared by every model deployment: | BBR (Body-Based Router) | `istio-system` | `BBR_VERSION` (v1.3.1) | helm | Installed in Istio's rootNamespace so its EnvoyFilter applies cluster-wide; injects `X-Gateway-Model-Name`. | | `llm-gateway-auth` ([`kaito-project/llm-gateway-auth`](https://github.com/kaito-project/llm-gateway-auth)) | `llm-gateway-auth` | `LLM_GATEWAY_AUTH_VERSION` | helm | API-key ext_authz for the `inference-gateway`. Installs the `APIKey` CRD, the `apikey-operator` (reconciles `APIKey` → per-namespace Secret), and the `apikey-authz` ext_authz dataplane wired into Istio via `MeshConfig` + `AuthorizationPolicy`. | | KEDA + KEDA Kaito Scaler ([`kaito-project/keda-kaito-scaler`](https://github.com/kaito-project/keda-kaito-scaler), optional) | `keda` | `KEDA_VERSION` (v2.19.0), `KEDA_KAITO_SCALER_VERSION` (v0.4.1) | helm | Workload-metric autoscaling. | -| `model-not-found` (Deployment + ConfigMap + Service) | `default` | repo `HEAD` ([`hack/e2e/manifests/model-not-found.yaml`](hack/e2e/manifests/model-not-found.yaml)) | kubectl | Cluster-shared nginx-backed Service that returns OpenAI-compatible `404 model_not_found` JSON. Referenced cross-namespace by every workload namespace's catch-all `HTTPRoute` (authorised via a `ReferenceGrant` rendered by `charts/modelharness`). | ### Step 2. modelharness (one-time per workload namespace) diff --git a/charts/modelharness/templates/envoyfilter-not-found.yaml b/charts/modelharness/templates/envoyfilter-not-found.yaml new file mode 100644 index 0000000..f00b83f --- /dev/null +++ b/charts/modelharness/templates/envoyfilter-not-found.yaml @@ -0,0 +1,62 @@ +{{/* +Catch-all "model not found" responder, implemented as an Envoy +direct_response on the per-namespace Gateway. Replaces the previous +HTTPRoute → cluster-shared `model-not-found` Service design. 
+ +Why an EnvoyFilter direct_response instead of a backed HTTPRoute: + - Zero backend Pod / Service / cross-namespace ReferenceGrant. + - Response body is generated by Envoy itself (no extra hop). + +Why a catch-all is REQUIRED (and not just a UX nicety): + Istio's CUSTOM AuthorizationPolicy is implemented as a paired + `envoy.filters.http.rbac` (shadow) + `envoy.filters.http.ext_authz` + filter — ext_authz is gated on metadata that the RBAC shadow filter + writes during decodeHeaders. When Envoy's router fails to match any + HTTPRoute it returns a local 404 BEFORE the RBAC shadow has finished + evaluating + writing that metadata, which means ext_authz is never + invoked and unknown-model requests SILENTLY BYPASS API-key auth. + Keeping a catch-all route that always matches preserves the full + filter-chain run and ensures auth runs on every request, regardless + of model name. Removing this template re-opens that bypass. + +The patch uses `applyTo: VIRTUAL_HOST` in the `GATEWAY` context to +MERGE the fallback route into the Gateway's route configuration. The +`workloadSelector` scopes it to this namespace's Gateway pod only. +*/}} +apiVersion: networking.istio.io/v1alpha3 +kind: EnvoyFilter +metadata: + name: model-not-found-direct + namespace: {{ include "modelharness.namespace" . }} + labels: + {{- include "modelharness.labels" . | nindent 4 }} +spec: + workloadSelector: + labels: + gateway.networking.k8s.io/gateway-name: {{ include "modelharness.gatewayName" . | quote }} + configPatches: + - applyTo: VIRTUAL_HOST + match: + context: GATEWAY + routeConfiguration: + vhost: + name: "" + patch: + operation: MERGE + value: + routes: + # Appended last; deployment-specific HTTPRoute matches on + # X-Gateway-Model-Name win first, this rule catches the rest. 
+ - name: model-not-found-fallback + match: + prefix: / + direct_response: + status: 404 + body: + inline_string: | + {"error":{"message":"The model does not exist.","type":"invalid_request_error","param":"model","code":"model_not_found"}} + response_headers_to_add: + - header: + key: content-type + value: application/json + append_action: OVERWRITE_IF_EXISTS_OR_ADD diff --git a/charts/modelharness/templates/httproute-not-found.yaml b/charts/modelharness/templates/httproute-not-found.yaml deleted file mode 100644 index 58c8cf1..0000000 --- a/charts/modelharness/templates/httproute-not-found.yaml +++ /dev/null @@ -1,34 +0,0 @@ -{{/* -Catch-all HTTPRoute. Routes any request whose model name did not match -a deployment-specific HTTPRoute (rendered by charts/modeldeployment) to -the cluster-shared model-not-found Service so unknown models receive -an OpenAI-compatible 404 JSON body instead of Envoy's bare 404. - -The Service lives in `.Values.modelNotFound.namespace` (typically -`default`) and is shared by every workload namespace. The cross-namespace -backendRef is authorised by the ReferenceGrant rendered alongside this -route (see referencegrant.yaml). -*/}} -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: model-not-found-route - namespace: {{ include "modelharness.namespace" . }} - labels: - {{- include "modelharness.labels" . | nindent 4 }} -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: {{ include "modelharness.gatewayName" . 
| quote }} - rules: - - matches: - - path: - type: PathPrefix - value: / - backendRefs: - - group: "" - kind: Service - name: {{ .Values.modelNotFound.serviceName }} - namespace: {{ .Values.modelNotFound.namespace }} - port: 80 diff --git a/charts/modelharness/templates/referencegrant.yaml b/charts/modelharness/templates/referencegrant.yaml deleted file mode 100644 index d818511..0000000 --- a/charts/modelharness/templates/referencegrant.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{{/* -ReferenceGrant in the model-not-found Service's namespace (default by -convention) authorising this workload namespace's catch-all HTTPRoute -to reference `/` -across namespaces. Named after the consuming namespace so each -workload namespace fully owns its grant; deleted automatically by -`helm uninstall modelharness`. -*/}} -apiVersion: gateway.networking.k8s.io/v1beta1 -kind: ReferenceGrant -metadata: - name: allow-model-not-found-from-{{ include "modelharness.namespace" . }} - namespace: {{ .Values.modelNotFound.namespace }} - labels: - {{- include "modelharness.labels" . | nindent 4 }} -spec: - from: - - group: gateway.networking.k8s.io - kind: HTTPRoute - namespace: {{ include "modelharness.namespace" . }} - to: - - group: "" - kind: Service - name: {{ .Values.modelNotFound.serviceName }} diff --git a/charts/modelharness/values.yaml b/charts/modelharness/values.yaml index 31acccc..54c099c 100644 --- a/charts/modelharness/values.yaml +++ b/charts/modelharness/values.yaml @@ -19,15 +19,13 @@ gatewayName: "" # gatewayPort is the HTTP listener port on the Gateway. gatewayPort: 80 -# modelNotFound configures the cross-namespace reference to the -# cluster-shared model-not-found Service that the catch-all HTTPRoute -# forwards unmatched requests to. 
The Service itself is installed once -# per cluster in `modelNotFound.namespace` (typically `default`) by the -# E2E install script — this chart only renders the catch-all HTTPRoute -# and the ReferenceGrant authorising the cross-namespace backendRef. -modelNotFound: - namespace: "default" - serviceName: "model-not-found" +# Catch-all "model not found" responses are now produced by an Envoy +# direct_response patched onto the Gateway's virtual host via the +# `model-not-found-direct` EnvoyFilter (see +# templates/envoyfilter-not-found.yaml). No backend Pod / Service / +# ReferenceGrant is required, so the previous `modelNotFound` config +# (which pointed at a cluster-shared `default/model-not-found` Service) +# has been removed. # auth toggles the per-namespace API-key authentication artifacts. When # enabled, the chart renders: diff --git a/hack/e2e/manifests/model-not-found.yaml b/hack/e2e/manifests/model-not-found.yaml deleted file mode 100644 index 99aecfd..0000000 --- a/hack/e2e/manifests/model-not-found.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Cluster-shared model-not-found Service. -# -# Returns an OpenAI-compatible 404 JSON body (`code: model_not_found`) -# instead of Envoy's bare 404. Installed once per cluster in `default` -# by hack/e2e/scripts/install-components.sh and consumed by every -# workload namespace's catch-all HTTPRoute via a ReferenceGrant -# rendered by charts/modelharness. 
-apiVersion: apps/v1 -kind: Deployment -metadata: - name: model-not-found - namespace: default -spec: - replicas: 1 - selector: - matchLabels: - app: model-not-found - template: - metadata: - labels: - app: model-not-found - spec: - containers: - - name: nginx - image: nginx:alpine - volumeMounts: - - name: conf - mountPath: /etc/nginx/conf.d - volumes: - - name: conf - configMap: - name: model-not-found-conf ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: model-not-found-conf - namespace: default -data: - default.conf: | - server { - listen 80; - location / { - default_type application/json; - return 404 '{"error":{"message":"The model specified in the request does not exist. Please check the model name and try again.","type":"invalid_request_error","code":"model_not_found"}}'; - } - } ---- -apiVersion: v1 -kind: Service -metadata: - name: model-not-found - namespace: default -spec: - selector: - app: model-not-found - ports: - - port: 80 diff --git a/hack/e2e/scripts/install-components.sh b/hack/e2e/scripts/install-components.sh index 0df5920..96e1b2c 100755 --- a/hack/e2e/scripts/install-components.sh +++ b/hack/e2e/scripts/install-components.sh @@ -21,9 +21,11 @@ # CRD is not yet served, so kubelet retries # until KAITO finishes installing it) # - BBR chart prefetch (git clone fork repo only) -# - Cluster-shared model-not-found Service in `default` (consumed by -# every workload namespace's catch-all HTTPRoute via a -# ReferenceGrant rendered by charts/modelharness). +# +# (Catch-all 404 handling is now provided by an EnvoyFilter +# direct_response rendered per-namespace by charts/modelharness — no +# cluster-shared Service is required, so install_model_not_found has +# been removed from this script.) 
# # Phase 2 (parallel, depends on Phase 1): # - Istio (after Gateway API CRDs) @@ -52,7 +54,6 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -MANIFESTS_DIR="${SCRIPT_DIR}/../manifests" # Validate required version variables are set. : "${ISTIO_VERSION:?ISTIO_VERSION is not set. Source versions.env or export it before calling this script.}" @@ -265,8 +266,19 @@ install_gateway_api_crds() { } install_gwie_crds() { + # Use server-side apply (--server-side --force-conflicts) instead of the + # default client-side apply. install_gwie_crds runs in parallel with + # install_kaito in phase1-base, and the KAITO chart bundles the same + # GWIE CRDs (inferencepools / inferenceobjectives in both + # inference.networking.k8s.io and inference.networking.x-k8s.io groups). + # Client-side apply does GET → CREATE-if-missing, which races with KAITO + # creating the CRD between the GET and the CREATE and fails with + # `AlreadyExists`. Server-side apply is a single atomic PATCH with a + # field manager: if the object already exists it is merged in place + # (with --force-conflicts taking ownership of any fields KAITO set). echo "=== Installing GWIE CRDs ===" - kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest/download/manifests.yaml" + kubectl apply --server-side --force-conflicts \ + -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest/download/manifests.yaml" } install_keda() { @@ -430,18 +442,6 @@ install_llm_gateway_auth() { kubectl -n llm-gateway-auth rollout status deployment/apikey-authz --timeout=180s || true } -install_model_not_found() { - # Cluster-shared catch-all 404 Service in `default`. Every workload - # namespace's modelharness release renders a catch-all HTTPRoute that - # forwards unmatched requests to this Service across namespaces, - # authorised by a per-namespace ReferenceGrant. 
- echo "=== Deploying cluster-shared model-not-found Service in default ===" - kubectl apply -f "${MANIFESTS_DIR}/model-not-found.yaml" - - echo "⏳ Waiting for model-not-found service..." - kubectl -n default rollout status deployment/model-not-found --timeout=120s || true -} - # ── Phased execution ────────────────────────────────────────────────────── # # Per-namespace shared resources (Gateway, catch-all HTTPRoute, @@ -456,8 +456,7 @@ run_phase phase1-base \ install_keda \ install_keda_kaito_scaler \ install_gpu_mocker \ - prefetch_bbr_chart \ - install_model_not_found + prefetch_bbr_chart run_phase phase2-istio \ install_istio diff --git a/hack/e2e/scripts/validate-components.sh b/hack/e2e/scripts/validate-components.sh index 2ecab86..57d2b6f 100755 --- a/hack/e2e/scripts/validate-components.sh +++ b/hack/e2e/scripts/validate-components.sh @@ -79,22 +79,9 @@ fi kubectl -n istio-system get pods -l app=body-based-router 2>/dev/null || true echo "" -# ── Cluster-shared model-not-found backend ────────────────────────────── -# After the modelharness refactor, per-namespace Istio Gateways -# ("-gw") are provisioned at test time by EnsureNamespace -# (charts/modelharness), so no `inference-gateway` Gateway pod exists in -# `default` to validate at install time. The only namespace-tier -# component install-components.sh still pre-installs is the -# cluster-shared 404 Service that every workload namespace's catch-all -# HTTPRoute references via a ReferenceGrant — validate that here. 
-echo "=== model-not-found (cluster-shared 404 backend) ===" -if kubectl -n default wait --for=condition=ready pod -l app=model-not-found --timeout="${TIMEOUT}" >/dev/null 2>&1; then - pass "model-not-found pod is Running" -else - fail "model-not-found pod is NOT Running" -fi -kubectl -n default get pods -l app=model-not-found 2>/dev/null || true -echo "" +# (Catch-all 404 handling is now produced by an EnvoyFilter +# direct_response rendered per-namespace by charts/modelharness — no +# cluster-shared Service exists to validate.) # ── KEDA ───────────────────────────────────────────────────────────────── echo "=== KEDA (namespace: ${KEDA_NAMESPACE}, provider: ${E2E_PROVIDER}) ===" diff --git a/test/e2e/README.md b/test/e2e/README.md index 7f5b660..5121cc4 100644 --- a/test/e2e/README.md +++ b/test/e2e/README.md @@ -15,7 +15,7 @@ Single source of truth: [`cases.go`](cases.go) → `CaseDeployments`. Each entry `Name` is unique cluster-wide and is the value matched by `X-Gateway-Model-Name` (i.e. the `model` field clients send in OpenAI-compatible requests). `Model` is the KAITO preset only — multiple deployments may share a preset under different `Name`s. -Inference tests target the case's **`caseGatewayURL`**. Each case namespace gets its own Gateway, catch-all `model-not-found` route, and (when enabled) API-key auth artifacts via the [`charts/modelharness`](../../charts/modelharness) chart installed by `EnsureNamespace`. +Inference tests target the case's **`caseGatewayURL`**. Each case namespace gets its own Gateway, catch-all `model-not-found-direct` EnvoyFilter (Envoy `direct_response` 404), and (when enabled) API-key auth artifacts via the [`charts/modelharness`](../../charts/modelharness) chart installed by `EnsureNamespace`. ## Helpers @@ -159,7 +159,7 @@ var GinkgoLabelMyFeature = ginkgo.Label("MyFeature") ### 5. 
Add per-namespace resources (rare) -If your case needs additional cluster-side resources beyond what the [`charts/modelharness`](../../charts/modelharness) chart already provisions (Gateway, catch-all `model-not-found` Service + HTTPRoute, optional `AuthorizationPolicy` + `APIKey`), add them as templates in `charts/modelharness` so every workload namespace picks them up consistently. +If your case needs additional cluster-side resources beyond what the [`charts/modelharness`](../../charts/modelharness) chart already provisions (Gateway, catch-all `model-not-found-direct` EnvoyFilter, optional `AuthorizationPolicy` + `APIKey`), add them as templates in `charts/modelharness` so every workload namespace picks them up consistently. ### 6. Validate diff --git a/test/e2e/gpu_mocker_test.go b/test/e2e/gpu_mocker_test.go index 322d2bb..3ff95a4 100644 --- a/test/e2e/gpu_mocker_test.go +++ b/test/e2e/gpu_mocker_test.go @@ -330,14 +330,14 @@ var _ = Describe("GPU Mocker E2E", Ordered, func() { Context("Non-existent model request", func() { It("should return 404 with an OpenAI-compatible error for an unknown model", func() { - // The catch-all model-not-found HTTPRoute is provisioned - // per-namespace by the modelharness chart (installed via - // EnsureNamespace) and forwards unmatched requests across - // namespaces to the cluster-shared `default/model-not-found` - // Service (authorised by a ReferenceGrant). The gpu-mocker - // case has AuthAPIKeyEnabled=false, so no - // AuthorizationPolicy is rendered and the probe needs no - // bearer token. + // The catch-all `model-not-found-direct` EnvoyFilter is + // provisioned per-namespace by the modelharness chart + // (installed via EnsureNamespace) and patches an Envoy + // `direct_response` (status 404 + OpenAI-compatible JSON) onto + // the Gateway's virtual host as a catch-all route. No backend + // Pod / Service is involved. 
The gpu-mocker case has + // AuthAPIKeyEnabled=false, so no AuthorizationPolicy is + // rendered and the probe needs no bearer token. resp, err := utils.SendChatCompletion(caseGatewayURL, "non-existent-model-xyz") Expect(err).NotTo(HaveOccurred()) Expect(resp.StatusCode).To(Equal(http.StatusNotFound)) diff --git a/test/e2e/model_routing_test.go b/test/e2e/model_routing_test.go index 718d453..cfc334f 100644 --- a/test/e2e/model_routing_test.go +++ b/test/e2e/model_routing_test.go @@ -28,7 +28,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" "github.com/kaito-project/production-stack/test/e2e/utils" ) @@ -48,7 +47,8 @@ import ( // - Istio Gateway with Body-Based Routing (BBR) configured // - At least two KAITO InferenceSets serving different models // - GPU node mocker creating shadow pods with llm-d-inference-sim -// - model-not-found catch-all HTTPRoute deployed +// - Catch-all `model-not-found-direct` EnvoyFilter (Envoy +// direct_response, rendered per-namespace by charts/modelharness) var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func() { // Per-case deployments owned by model_routing_test.go (see cases.go). @@ -73,8 +73,9 @@ var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func( var ctx context.Context // caseGatewayURL routes to this case's dedicated Gateway. Resolved - // in BeforeAll. Per-namespace catch-all model-not-found / Gateway - // resources are provisioned by the modelharness chart. + // in BeforeAll. Per-namespace Gateway + catch-all + // `model-not-found-direct` EnvoyFilter are provisioned by the + // modelharness chart. 
var caseGatewayURL string BeforeAll(func() { @@ -262,41 +263,26 @@ var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func( }) Context("Model-specific route wins over catch-all", func() { - It("should not route known model requests to the model-not-found service", func() { - clientset, err := utils.GetK8sClientset() - Expect(err).NotTo(HaveOccurred()) - - By("recording model-not-found pod request count before test") - // model-not-found is a cluster-shared Service installed once in - // utils.ModelNotFoundNamespace by - // hack/e2e/scripts/install-components.sh; every workload - // namespace's catch-all HTTPRoute references it via a - // ReferenceGrant rendered by charts/modelharness. - mnfPods, err := clientset.CoreV1().Pods(utils.ModelNotFoundNamespace).List(ctx, metav1.ListOptions{ - LabelSelector: utils.ModelNotFoundPodLabel, - FieldSelector: "status.phase=Running", - }) - Expect(err).NotTo(HaveOccurred()) - Expect(mnfPods.Items).NotTo(BeEmpty(), "model-not-found pods should be running") - - // Capture the nginx access log line count before traffic. - mnfPodName := mnfPods.Items[0].Name - beforeLogCount := countNginxAccessLogs(clientset, utils.ModelNotFoundNamespace, mnfPodName) - + // The catch-all is now an Envoy `direct_response` (status 404) + // patched onto the Gateway by the `model-not-found-direct` + // EnvoyFilter (charts/modelharness/templates/envoyfilter-not-found.yaml). + // If the catch-all ever hijacked a known-model request, the + // response would be 404 instead of 200, and the model field in + // the body would be missing. Assert both directly. 
+ It("should not absorb known model requests into the catch-all 404", func() { By("sending requests to known models") for _, model := range modelNames { for i := 0; i < 3; i++ { resp, err := sendChat(caseGatewayURL, model) Expect(err).NotTo(HaveOccurred()) - Expect(resp.StatusCode).To(Equal(http.StatusOK)) - resp.Body.Close() + Expect(resp.StatusCode).To(Equal(http.StatusOK), + "known-model request to %s should not fall through to catch-all 404", model) + parsed, perr := utils.ParseChatCompletionResponse(resp) + Expect(perr).NotTo(HaveOccurred()) + Expect(parsed.Model).To(Equal(model), + "response for %s should echo the requested model name", model) } } - - By("verifying model-not-found service received no new requests") - afterLogCount := countNginxAccessLogs(clientset, utils.ModelNotFoundNamespace, mnfPodName) - Expect(afterLogCount).To(Equal(beforeLogCount), - "model-not-found service should not have received any requests for known models") }) }) @@ -663,24 +649,6 @@ var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func( }) }) -// countNginxAccessLogs counts POST request lines in the nginx access log. -// Only POST lines are counted to exclude Kubernetes health probe traffic -// (GET /) which would cause false positives. -func countNginxAccessLogs(clientset *kubernetes.Clientset, namespace, podName string) int { - logs, err := utils.GetPodLogs(clientset, namespace, podName, "nginx") - if err != nil { - return 0 - } - count := 0 - for _, line := range strings.Split(logs, "\n") { - line = strings.TrimSpace(line) - if line != "" && strings.Contains(line, "\"POST ") { - count++ - } - } - return count -} - // findDebugLogTriple searches istio-ingressgateway logs for a request-id that // has [PRE-BBR], [POST-EPP], and [RESPONSE] lines, where the POST-EPP line // contains the expected model name. Returns the three log lines. 
diff --git a/test/e2e/production-stack-E2E-test-scenarios.md b/test/e2e/production-stack-E2E-test-scenarios.md index 66cdce3..21a3bfe 100644 --- a/test/e2e/production-stack-E2E-test-scenarios.md +++ b/test/e2e/production-stack-E2E-test-scenarios.md @@ -144,7 +144,7 @@ the correct pool received the request. EPP-side counters corroborate the schedul * Correct model name for ministral — Same validation for the second model; catches per-pool misrouting. * Cross-model isolation (serial) — Send N requests for each model sequentially. Scrape `vllm:request_success_total{model_name}` from every shadow pod before and after. Verify that phi pods' counters only incremented for phi requests, and ministral pods' counters only incremented for ministral requests. No cross-pool contamination. * Cross-model isolation (concurrent) — Launch **interleaved concurrent** traffic: 20 in-flight phi + 20 in-flight ministral requests at the same time. Verify per-pod `vllm:request_success_total{model_name}` still shows zero cross-contamination. BBR and EPP are two chained ext_proc filters; serial tests cannot expose header-state leakage between concurrent requests within the same Envoy worker. -* Model-specific route wins over catch-all — While the catch-all `model-not-found` HTTPRoute is deployed (see *Unknown model handling*), requests with a known model name must never hit it. Verify by scraping the `model-not-found` Service's request counter / access log: it must stay at 0 during the above cross-model runs. Guards against HTTPRoute ordering regressions where the catch-all rule silently absorbs valid traffic. +* Model-specific route wins over catch-all — The catch-all `model-not-found-direct` EnvoyFilter (see *Unknown model handling*) patches an Envoy `direct_response` (status 404) onto the Gateway's virtual host. Requests with a known model name must never hit it: if they did, the response would be 404 instead of 200. 
Verify by sending requests for each known model and asserting `HTTP 200` plus a body whose `model` field matches the request. Guards against route-ordering regressions where the catch-all rule silently absorbs valid traffic. * EPP routing success (metrics) — After the above runs, verify `inference_extension_scheduler_attempts_total{status="success"}` increased by the total requests sent and `{status="failure"}` did not change. Verify `inference_objective_request_total{model_name="routing-phi"}` and `{model_name="routing-ministral"}` match the per-model counts. Proves EPP actively scheduled each request rather than falling through to a default route. * Load distribution — With 2 replicas per pool, send 20+ requests per model. Scrape `vllm:request_success_total` from each pod and verify no pod received 0 requests and none received more than 80% of its pool's traffic. Cross-check with `inference_pool_per_pod_queue_size{name, model_server_pod}` to confirm both pods were active. If one pod gets all traffic, EPP's scoring or endpoint list is broken. * Debug EnvoyFilter log chain — For **one** representative request in each of the cases above (phi, ministral, concurrent cross-model), tail istio-ingressgateway logs and verify the `inference-debug-filter` Lua chain emitted exactly one `[PRE-BBR]`, one `[POST-EPP]`, and one `[RESPONSE]` line sharing the same `x-request-id`. In the `[POST-EPP]` line, `x-gateway-model-name` equals the request's model field (proves BBR ran) and `x-gateway-destination-endpoint` is a non-empty `IP:port` matching the pod that actually served the request per `vllm:request_success_total` (proves EPP ran and its decision was honoured). Health-check `GET /` traffic must not produce these log lines. This folds the debug/observability surface into the main routing assertions so filter-chain ordering regressions (e.g., Istio upgrade) cannot silently break on-cluster debugging. @@ -216,7 +216,7 @@ individually so failures can be localised. 
Verifies the Gateway's client-compatibility contract for bad inputs: unknown models hit the catch-all JSON 404, and malformed bodies do not crash BBR or leak Envoy's raw error pages. -* 404 for unknown model — Request with `{"model": "does-not-exist", ...}` returns HTTP 404 with the OpenAI-compatible JSON body `{"error":{"code":"model_not_found", ...}}` served by the `model-not-found` Service — not Envoy's raw 404 HTML. +* 404 for unknown model — Request with `{"model": "does-not-exist", ...}` returns HTTP 404 with the OpenAI-compatible JSON body `{"error":{"code":"model_not_found", ...}}` produced by the per-namespace `model-not-found-direct` EnvoyFilter (Envoy `direct_response`) — not Envoy's raw 404 HTML. * Missing `model` field — Request body `{"messages": [...]}` with no `model` key. BBR cannot inject `x-gateway-model-name`; verify the request is handled predictably (routed to the catch-all 404 JSON, not a 500 or hang). * Non-string `model` field — Body with `{"model": 42, ...}`. Verify BBR rejects or falls through cleanly (catch-all 404 JSON), with no Envoy 5xx. * Non-JSON body on `/v1/*` — Send `text/plain` or truncated JSON to `POST /v1/chat/completions`. Verify response is a well-formed error (4xx), the BBR ext_proc filter does not crash (Envoy stays up, subsequent valid requests succeed), and no goroutine leak appears in BBR logs. diff --git a/test/e2e/utils/dynamic.go b/test/e2e/utils/dynamic.go index aaee21a..08f7971 100644 --- a/test/e2e/utils/dynamic.go +++ b/test/e2e/utils/dynamic.go @@ -57,23 +57,6 @@ var ( Kind: "Gateway", } - // HTTPRouteGVK identifies HTTPRoute resources (used to create the - // per-namespace catch-all that returns OpenAI-compatible 404 JSON). 
- HTTPRouteGVK = schema.GroupVersionKind{ - Group: "gateway.networking.k8s.io", - Version: "v1", - Kind: "HTTPRoute", - } - - // ReferenceGrantGVK identifies the ReferenceGrant used to permit a - // per-case HTTPRoute to reference the shared model-not-found Service - // living in the default namespace. - ReferenceGrantGVK = schema.GroupVersionKind{ - Group: "gateway.networking.k8s.io", - Version: "v1beta1", - Kind: "ReferenceGrant", - } - // AuthorizationPolicyGVK identifies the Istio AuthorizationPolicy used // to wire each per-case Gateway into the apikey-ext-authz CUSTOM // provider. The upstream llm-gateway-apikey chart only installs an AP diff --git a/test/e2e/utils/helm.go b/test/e2e/utils/helm.go index 4ea1c75..c223c28 100644 --- a/test/e2e/utils/helm.go +++ b/test/e2e/utils/helm.go @@ -190,8 +190,9 @@ func modelHarnessChartPath() string { // InstallModelHarness runs `helm upgrade --install` for the modelharness chart // in `namespace`. It provisions the per-namespace Gateway (named -// "-gw" by chart default), the catch-all model-not-found -// HTTPRoute + ReferenceGrant, and — when authEnabled is true — the +// "-gw" by chart default), the catch-all +// `model-not-found-direct` EnvoyFilter (Envoy `direct_response` returning +// 404 + OpenAI-compatible JSON), and — when authEnabled is true — the // per-namespace AuthorizationPolicy + APIKey CR. When // networkPolicyEnabled is true, the chart additionally renders the // default-deny-ingress / allow-inference-traffic NetworkPolicies that diff --git a/test/e2e/utils/http.go b/test/e2e/utils/http.go index 0d3c7de..eb0ce79 100644 --- a/test/e2e/utils/http.go +++ b/test/e2e/utils/http.go @@ -39,17 +39,6 @@ const ( // HTTPTimeout is the default timeout for HTTP requests. // Set high to account for BBR/EPP ext_proc startup latency. 
HTTPTimeout = 60 * time.Second - - // ModelNotFoundNamespace is the namespace that hosts the cluster-shared - // `model-not-found` Deployment + Service installed once per cluster by - // hack/e2e/scripts/install-components.sh. Every workload namespace's - // catch-all HTTPRoute (rendered by charts/modelharness) forwards - // unmatched requests here via a cross-namespace ReferenceGrant. - ModelNotFoundNamespace = "default" - - // ModelNotFoundPodLabel is the label selector used to list the - // model-not-found Pods (e.g. when scraping nginx access logs). - ModelNotFoundPodLabel = "app=model-not-found" ) // ChatCompletionRequest represents an OpenAI-compatible chat completion request body. diff --git a/test/e2e/utils/setup.go b/test/e2e/utils/setup.go index 6b5bfbb..65602c3 100644 --- a/test/e2e/utils/setup.go +++ b/test/e2e/utils/setup.go @@ -33,7 +33,9 @@ import ( // EnsureNamespace creates the namespace if it does not exist and installs // the modelharness Helm chart into it. modelharness owns every per-namespace // shared resource: the Istio Gateway (named "-gw" by chart -// default), the catch-all model-not-found HTTPRoute + ReferenceGrant, +// default), the catch-all `model-not-found-direct` EnvoyFilter (Envoy +// `direct_response` returning 404 + OpenAI-compatible JSON for any +// request not matched by a deployment-specific HTTPRoute), // — when authEnabled is true — the AuthorizationPolicy + APIKey CR // that wire the Gateway into the cluster-wide apikey-ext-authz CUSTOM // provider, and — when networkPolicyEnabled is true — the