diff --git a/README.md b/README.md
index 0e4b2eb..baee048 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,6 @@ namespaces and are shared by every model deployment:
 | BBR (Body-Based Router)              | `istio-system`   | `BBR_VERSION` (v1.3.1)                  | helm           | Installed in Istio's rootNamespace so its EnvoyFilter applies cluster-wide; injects `X-Gateway-Model-Name`. |
 | `llm-gateway-auth` ([`kaito-project/llm-gateway-auth`](https://github.com/kaito-project/llm-gateway-auth)) | `llm-gateway-auth` | `LLM_GATEWAY_AUTH_VERSION` | helm           | API-key ext_authz for the `inference-gateway`. Installs the `APIKey` CRD, the `apikey-operator` (reconciles `APIKey` → per-namespace Secret), and the `apikey-authz` ext_authz dataplane wired into Istio via `MeshConfig` + `AuthorizationPolicy`. |
 | KEDA + KEDA Kaito Scaler ([`kaito-project/keda-kaito-scaler`](https://github.com/kaito-project/keda-kaito-scaler), optional)  | `keda` | `KEDA_VERSION` (v2.19.0), `KEDA_KAITO_SCALER_VERSION` (v0.4.1) | helm | Workload-metric autoscaling.                                                    |
-| `model-not-found` (Deployment + ConfigMap + Service) | `default` | repo `HEAD` ([`hack/e2e/manifests/model-not-found.yaml`](hack/e2e/manifests/model-not-found.yaml)) | kubectl | Cluster-shared nginx-backed Service that returns OpenAI-compatible `404 model_not_found` JSON. Referenced cross-namespace by every workload namespace's catch-all `HTTPRoute` (authorised via a `ReferenceGrant` rendered by `charts/modelharness`). |
 
 ### Step 2. modelharness (one-time per workload namespace)
 
diff --git a/charts/modelharness/templates/envoyfilter-not-found.yaml b/charts/modelharness/templates/envoyfilter-not-found.yaml
new file mode 100644
index 0000000..f00b83f
--- /dev/null
+++ b/charts/modelharness/templates/envoyfilter-not-found.yaml
@@ -0,0 +1,62 @@
+{{/*
+Catch-all "model not found" responder, implemented as an Envoy
+direct_response on the per-namespace Gateway. Replaces the previous
+HTTPRoute → cluster-shared `model-not-found` Service design.
+
+Why an EnvoyFilter direct_response instead of a backed HTTPRoute:
+  - Zero backend Pod / Service / cross-namespace ReferenceGrant.
+  - Response body is generated by Envoy itself (no extra hop).
+
+Why a catch-all is REQUIRED (and not just a UX nicety):
+  Istio's CUSTOM AuthorizationPolicy is implemented as a paired
+  `envoy.filters.http.rbac` (shadow) + `envoy.filters.http.ext_authz`
+  filter — ext_authz is gated on metadata that the RBAC shadow filter
+  writes during decodeHeaders. When Envoy's router fails to match any
+  HTTPRoute it returns a local 404 BEFORE the RBAC shadow has finished
+  evaluating + writing that metadata, which means ext_authz is never
+  invoked and unknown-model requests SILENTLY BYPASS API-key auth.
+  Keeping a catch-all route that always matches preserves the full
+  filter-chain run and ensures auth runs on every request, regardless
+  of model name. Removing this template re-opens that bypass.
+
+The patch MERGEs one extra route into the virtual hosts of the Gateway's
+route configuration (`applyTo: VIRTUAL_HOST`; the empty vhost name is
+meant to match all). `workloadSelector` scopes it to this namespace's Gateway pod only.
+*/}}
+apiVersion: networking.istio.io/v1alpha3
+kind: EnvoyFilter
+metadata:
+  name: model-not-found-direct
+  namespace: {{ include "modelharness.namespace" . }}
+  labels:
+    {{- include "modelharness.labels" . | nindent 4 }}
+spec:
+  workloadSelector:
+    labels:
+      gateway.networking.k8s.io/gateway-name: {{ include "modelharness.gatewayName" . | quote }}
+  configPatches:
+    - applyTo: VIRTUAL_HOST
+      match:
+        context: GATEWAY
+        routeConfiguration:
+          vhost:
+            name: ""
+      patch:
+        operation: MERGE
+        value:
+          routes:
+            # Appended last; deployment-specific HTTPRoute matches on
+            # X-Gateway-Model-Name win first, this rule catches the rest.
+            - name: model-not-found-fallback
+              match:
+                prefix: /
+              direct_response:
+                status: 404
+                body:
+                  inline_string: |
+                    {"error":{"message":"The model does not exist.","type":"invalid_request_error","param":"model","code":"model_not_found"}}
+              response_headers_to_add:
+                - header:
+                    key: content-type
+                    value: application/json
+                  append_action: OVERWRITE_IF_EXISTS_OR_ADD
diff --git a/charts/modelharness/templates/httproute-not-found.yaml b/charts/modelharness/templates/httproute-not-found.yaml
deleted file mode 100644
index 58c8cf1..0000000
--- a/charts/modelharness/templates/httproute-not-found.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-{{/*
-Catch-all HTTPRoute. Routes any request whose model name did not match
-a deployment-specific HTTPRoute (rendered by charts/modeldeployment) to
-the cluster-shared model-not-found Service so unknown models receive
-an OpenAI-compatible 404 JSON body instead of Envoy's bare 404.
-
-The Service lives in `.Values.modelNotFound.namespace` (typically
-`default`) and is shared by every workload namespace. The cross-namespace
-backendRef is authorised by the ReferenceGrant rendered alongside this
-route (see referencegrant.yaml).
-*/}}
-apiVersion: gateway.networking.k8s.io/v1
-kind: HTTPRoute
-metadata:
-  name: model-not-found-route
-  namespace: {{ include "modelharness.namespace" . }}
-  labels:
-    {{- include "modelharness.labels" . | nindent 4 }}
-spec:
-  parentRefs:
-    - group: gateway.networking.k8s.io
-      kind: Gateway
-      name: {{ include "modelharness.gatewayName" . | quote }}
-  rules:
-    - matches:
-        - path:
-            type: PathPrefix
-            value: /
-      backendRefs:
-        - group: ""
-          kind: Service
-          name: {{ .Values.modelNotFound.serviceName }}
-          namespace: {{ .Values.modelNotFound.namespace }}
-          port: 80
diff --git a/charts/modelharness/templates/referencegrant.yaml b/charts/modelharness/templates/referencegrant.yaml
deleted file mode 100644
index d818511..0000000
--- a/charts/modelharness/templates/referencegrant.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-{{/*
-ReferenceGrant in the model-not-found Service's namespace (default by
-convention) authorising this workload namespace's catch-all HTTPRoute
-to reference `<modelNotFound.namespace>/<modelNotFound.serviceName>`
-across namespaces. Named after the consuming namespace so each
-workload namespace fully owns its grant; deleted automatically by
-`helm uninstall modelharness`.
-*/}}
-apiVersion: gateway.networking.k8s.io/v1beta1
-kind: ReferenceGrant
-metadata:
-  name: allow-model-not-found-from-{{ include "modelharness.namespace" . }}
-  namespace: {{ .Values.modelNotFound.namespace }}
-  labels:
-    {{- include "modelharness.labels" . | nindent 4 }}
-spec:
-  from:
-    - group: gateway.networking.k8s.io
-      kind: HTTPRoute
-      namespace: {{ include "modelharness.namespace" . }}
-  to:
-    - group: ""
-      kind: Service
-      name: {{ .Values.modelNotFound.serviceName }}
diff --git a/charts/modelharness/values.yaml b/charts/modelharness/values.yaml
index 31acccc..54c099c 100644
--- a/charts/modelharness/values.yaml
+++ b/charts/modelharness/values.yaml
@@ -19,15 +19,13 @@ gatewayName: ""
 # gatewayPort is the HTTP listener port on the Gateway.
 gatewayPort: 80
 
-# modelNotFound configures the cross-namespace reference to the
-# cluster-shared model-not-found Service that the catch-all HTTPRoute
-# forwards unmatched requests to. The Service itself is installed once
-# per cluster in `modelNotFound.namespace` (typically `default`) by the
-# E2E install script — this chart only renders the catch-all HTTPRoute
-# and the ReferenceGrant authorising the cross-namespace backendRef.
-modelNotFound:
-  namespace: "default"
-  serviceName: "model-not-found"
+# Catch-all "model not found" responses are now produced by an Envoy
+# direct_response merged into the Gateway's virtual hosts via the
+# `model-not-found-direct` EnvoyFilter (see
+# templates/envoyfilter-not-found.yaml). No backend Pod / Service /
+# ReferenceGrant is required, so the previous `modelNotFound` config
+# (which pointed at a cluster-shared `default/model-not-found` Service)
+# has been removed.
 
 # auth toggles the per-namespace API-key authentication artifacts. When
 # enabled, the chart renders:
diff --git a/hack/e2e/manifests/model-not-found.yaml b/hack/e2e/manifests/model-not-found.yaml
deleted file mode 100644
index 99aecfd..0000000
--- a/hack/e2e/manifests/model-not-found.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-# Cluster-shared model-not-found Service.
-#
-# Returns an OpenAI-compatible 404 JSON body (`code: model_not_found`)
-# instead of Envoy's bare 404. Installed once per cluster in `default`
-# by hack/e2e/scripts/install-components.sh and consumed by every
-# workload namespace's catch-all HTTPRoute via a ReferenceGrant
-# rendered by charts/modelharness.
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: model-not-found
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: model-not-found
-  template:
-    metadata:
-      labels:
-        app: model-not-found
-    spec:
-      containers:
-      - name: nginx
-        image: nginx:alpine
-        volumeMounts:
-        - name: conf
-          mountPath: /etc/nginx/conf.d
-      volumes:
-      - name: conf
-        configMap:
-          name: model-not-found-conf
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: model-not-found-conf
-  namespace: default
-data:
-  default.conf: |
-    server {
-      listen 80;
-      location / {
-        default_type application/json;
-        return 404 '{"error":{"message":"The model specified in the request does not exist. Please check the model name and try again.","type":"invalid_request_error","code":"model_not_found"}}';
-      }
-    }
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: model-not-found
-  namespace: default
-spec:
-  selector:
-    app: model-not-found
-  ports:
-  - port: 80
diff --git a/hack/e2e/scripts/install-components.sh b/hack/e2e/scripts/install-components.sh
index 0df5920..96e1b2c 100755
--- a/hack/e2e/scripts/install-components.sh
+++ b/hack/e2e/scripts/install-components.sh
@@ -21,9 +21,11 @@
 #                                 CRD is not yet served, so kubelet retries
 #                                 until KAITO finishes installing it)
 #   - BBR chart prefetch (git clone fork repo only)
-#   - Cluster-shared model-not-found Service in `default` (consumed by
-#     every workload namespace's catch-all HTTPRoute via a
-#     ReferenceGrant rendered by charts/modelharness).
+#
+# (Catch-all 404 handling is now provided by an EnvoyFilter
+# direct_response rendered per-namespace by charts/modelharness — no
+# cluster-shared Service is required, so install_model_not_found has
+# been removed from this script.)
 #
 # Phase 2 (parallel, depends on Phase 1):
 #   - Istio                      (after Gateway API CRDs)
@@ -52,7 +54,6 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-MANIFESTS_DIR="${SCRIPT_DIR}/../manifests"
 
 # Validate required version variables are set.
 : "${ISTIO_VERSION:?ISTIO_VERSION is not set. Source versions.env or export it before calling this script.}"
@@ -265,8 +266,19 @@ install_gateway_api_crds() {
 }
 
 install_gwie_crds() {
+  # Use server-side apply (--server-side --force-conflicts) instead of the
+  # default client-side apply. install_gwie_crds runs in parallel with
+  # install_kaito in phase1-base, and the KAITO chart bundles the same
+  # GWIE CRDs (inferencepools / inferenceobjectives in both
+  # inference.networking.k8s.io and inference.networking.x-k8s.io groups).
+  # Client-side apply does GET → CREATE-if-missing, which races with KAITO
+  # creating the CRD between the GET and the CREATE and fails with
+  # `AlreadyExists`. Server-side apply is a single create-or-update PATCH
+  # with a field manager: if the object already exists it is merged in
+  # place (--force-conflicts takes ownership of any fields KAITO set).
   echo "=== Installing GWIE CRDs ==="
-  kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest/download/manifests.yaml"
+  kubectl apply --server-side --force-conflicts \
+    -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest/download/manifests.yaml"
 }
 
 install_keda() {
@@ -430,18 +442,6 @@ install_llm_gateway_auth() {
   kubectl -n llm-gateway-auth rollout status deployment/apikey-authz --timeout=180s || true
 }
 
-install_model_not_found() {
-  # Cluster-shared catch-all 404 Service in `default`. Every workload
-  # namespace's modelharness release renders a catch-all HTTPRoute that
-  # forwards unmatched requests to this Service across namespaces,
-  # authorised by a per-namespace ReferenceGrant.
-  echo "=== Deploying cluster-shared model-not-found Service in default ==="
-  kubectl apply -f "${MANIFESTS_DIR}/model-not-found.yaml"
-
-  echo "⏳ Waiting for model-not-found service..."
-  kubectl -n default rollout status deployment/model-not-found --timeout=120s || true
-}
-
 # ── Phased execution ──────────────────────────────────────────────────────
 #
 # Per-namespace shared resources (Gateway, catch-all HTTPRoute,
@@ -456,8 +456,7 @@ run_phase phase1-base \
   install_keda \
   install_keda_kaito_scaler \
   install_gpu_mocker \
-  prefetch_bbr_chart \
-  install_model_not_found
+  prefetch_bbr_chart
 
 run_phase phase2-istio \
   install_istio
diff --git a/hack/e2e/scripts/validate-components.sh b/hack/e2e/scripts/validate-components.sh
index 2ecab86..57d2b6f 100755
--- a/hack/e2e/scripts/validate-components.sh
+++ b/hack/e2e/scripts/validate-components.sh
@@ -79,22 +79,9 @@ fi
 kubectl -n istio-system get pods -l app=body-based-router 2>/dev/null || true
 echo ""
 
-# ── Cluster-shared model-not-found backend ──────────────────────────────
-# After the modelharness refactor, per-namespace Istio Gateways
-# ("<namespace>-gw") are provisioned at test time by EnsureNamespace
-# (charts/modelharness), so no `inference-gateway` Gateway pod exists in
-# `default` to validate at install time. The only namespace-tier
-# component install-components.sh still pre-installs is the
-# cluster-shared 404 Service that every workload namespace's catch-all
-# HTTPRoute references via a ReferenceGrant — validate that here.
-echo "=== model-not-found (cluster-shared 404 backend) ==="
-if kubectl -n default wait --for=condition=ready pod -l app=model-not-found --timeout="${TIMEOUT}" >/dev/null 2>&1; then
-  pass "model-not-found pod is Running"
-else
-  fail "model-not-found pod is NOT Running"
-fi
-kubectl -n default get pods -l app=model-not-found 2>/dev/null || true
-echo ""
+# (Catch-all 404 handling is now produced by an EnvoyFilter
+# direct_response rendered per-namespace by charts/modelharness — no
+# cluster-shared Service exists to validate.)
 
 # ── KEDA ─────────────────────────────────────────────────────────────────
 echo "=== KEDA (namespace: ${KEDA_NAMESPACE}, provider: ${E2E_PROVIDER}) ==="
diff --git a/test/e2e/README.md b/test/e2e/README.md
index 7f5b660..5121cc4 100644
--- a/test/e2e/README.md
+++ b/test/e2e/README.md
@@ -15,7 +15,7 @@ Single source of truth: [`cases.go`](cases.go) → `CaseDeployments`. Each entry
 
 `Name` is unique cluster-wide and is the value matched by `X-Gateway-Model-Name` (i.e. the `model` field clients send in OpenAI-compatible requests). `Model` is the KAITO preset only — multiple deployments may share a preset under different `Name`s.
 
-Inference tests target the case's **`caseGatewayURL`**. Each case namespace gets its own Gateway, catch-all `model-not-found` route, and (when enabled) API-key auth artifacts via the [`charts/modelharness`](../../charts/modelharness) chart installed by `EnsureNamespace`.
+Inference tests target the case's **`caseGatewayURL`**. Each case namespace gets its own Gateway, catch-all `model-not-found-direct` EnvoyFilter (Envoy `direct_response` 404), and (when enabled) API-key auth artifacts via the [`charts/modelharness`](../../charts/modelharness) chart installed by `EnsureNamespace`.
 
 ## Helpers
 
@@ -159,7 +159,7 @@ var GinkgoLabelMyFeature = ginkgo.Label("MyFeature")
 
 ### 5. Add per-namespace resources (rare)
 
-If your case needs additional cluster-side resources beyond what the [`charts/modelharness`](../../charts/modelharness) chart already provisions (Gateway, catch-all `model-not-found` Service + HTTPRoute, optional `AuthorizationPolicy` + `APIKey`), add them as templates in `charts/modelharness` so every workload namespace picks them up consistently.
+If your case needs additional cluster-side resources beyond what the [`charts/modelharness`](../../charts/modelharness) chart already provisions (Gateway, catch-all `model-not-found-direct` EnvoyFilter, optional `AuthorizationPolicy` + `APIKey`), add them as templates in `charts/modelharness` so every workload namespace picks them up consistently.
 
 ### 6. Validate
 
diff --git a/test/e2e/gpu_mocker_test.go b/test/e2e/gpu_mocker_test.go
index 322d2bb..3ff95a4 100644
--- a/test/e2e/gpu_mocker_test.go
+++ b/test/e2e/gpu_mocker_test.go
@@ -330,14 +330,14 @@ var _ = Describe("GPU Mocker E2E", Ordered, func() {
 
 		Context("Non-existent model request", func() {
 			It("should return 404 with an OpenAI-compatible error for an unknown model", func() {
-				// The catch-all model-not-found HTTPRoute is provisioned
-				// per-namespace by the modelharness chart (installed via
-				// EnsureNamespace) and forwards unmatched requests across
-				// namespaces to the cluster-shared `default/model-not-found`
-				// Service (authorised by a ReferenceGrant). The gpu-mocker
-				// case has AuthAPIKeyEnabled=false, so no
-				// AuthorizationPolicy is rendered and the probe needs no
-				// bearer token.
+				// The catch-all `model-not-found-direct` EnvoyFilter is
+				// provisioned per-namespace by the modelharness chart
+				// (installed via EnsureNamespace) and patches an Envoy
+				// `direct_response` (status 404 + OpenAI-compatible JSON) onto
+				// the Gateway's virtual host as a catch-all route. No backend
+				// Pod / Service is involved. The gpu-mocker case has
+				// AuthAPIKeyEnabled=false, so no AuthorizationPolicy is
+				// rendered and the probe needs no bearer token.
 				resp, err := utils.SendChatCompletion(caseGatewayURL, "non-existent-model-xyz")
 				Expect(err).NotTo(HaveOccurred())
 				Expect(resp.StatusCode).To(Equal(http.StatusNotFound))
diff --git a/test/e2e/model_routing_test.go b/test/e2e/model_routing_test.go
index 718d453..cfc334f 100644
--- a/test/e2e/model_routing_test.go
+++ b/test/e2e/model_routing_test.go
@@ -28,7 +28,6 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/client-go/kubernetes"
 
 	"github.com/kaito-project/production-stack/test/e2e/utils"
 )
@@ -48,7 +47,8 @@ import (
 //   - Istio Gateway with Body-Based Routing (BBR) configured
 //   - At least two KAITO InferenceSets serving different models
 //   - GPU node mocker creating shadow pods with llm-d-inference-sim
-//   - model-not-found catch-all HTTPRoute deployed
+//   - Catch-all `model-not-found-direct` EnvoyFilter (Envoy
+//     direct_response, rendered per-namespace by charts/modelharness)
 
 var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func() {
 	// Per-case deployments owned by model_routing_test.go (see cases.go).
@@ -73,8 +73,9 @@ var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func(
 	var ctx context.Context
 
 	// caseGatewayURL routes to this case's dedicated Gateway. Resolved
-	// in BeforeAll. Per-namespace catch-all model-not-found / Gateway
-	// resources are provisioned by the modelharness chart.
+	// in BeforeAll. Per-namespace Gateway + catch-all
+	// `model-not-found-direct` EnvoyFilter are provisioned by the
+	// modelharness chart.
 	var caseGatewayURL string
 
 	BeforeAll(func() {
@@ -262,41 +263,26 @@ var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func(
 	})
 
 	Context("Model-specific route wins over catch-all", func() {
-		It("should not route known model requests to the model-not-found service", func() {
-			clientset, err := utils.GetK8sClientset()
-			Expect(err).NotTo(HaveOccurred())
-
-			By("recording model-not-found pod request count before test")
-			// model-not-found is a cluster-shared Service installed once in
-			// utils.ModelNotFoundNamespace by
-			// hack/e2e/scripts/install-components.sh; every workload
-			// namespace's catch-all HTTPRoute references it via a
-			// ReferenceGrant rendered by charts/modelharness.
-			mnfPods, err := clientset.CoreV1().Pods(utils.ModelNotFoundNamespace).List(ctx, metav1.ListOptions{
-				LabelSelector: utils.ModelNotFoundPodLabel,
-				FieldSelector: "status.phase=Running",
-			})
-			Expect(err).NotTo(HaveOccurred())
-			Expect(mnfPods.Items).NotTo(BeEmpty(), "model-not-found pods should be running")
-
-			// Capture the nginx access log line count before traffic.
-			mnfPodName := mnfPods.Items[0].Name
-			beforeLogCount := countNginxAccessLogs(clientset, utils.ModelNotFoundNamespace, mnfPodName)
-
+		// The catch-all is now an Envoy `direct_response` (status 404)
+		// patched onto the Gateway by the `model-not-found-direct`
+		// EnvoyFilter (charts/modelharness/templates/envoyfilter-not-found.yaml).
+		// If the catch-all ever hijacked a known-model request, the
+		// response would be 404 instead of 200, and the model field in
+		// the body would be missing. Assert both directly.
+		It("should not absorb known model requests into the catch-all 404", func() {
 			By("sending requests to known models")
 			for _, model := range modelNames {
 				for i := 0; i < 3; i++ {
 					resp, err := sendChat(caseGatewayURL, model)
 					Expect(err).NotTo(HaveOccurred())
-					Expect(resp.StatusCode).To(Equal(http.StatusOK))
-					resp.Body.Close()
+					Expect(resp.StatusCode).To(Equal(http.StatusOK),
+						"known-model request to %s should not fall through to catch-all 404", model)
+					parsed, perr := utils.ParseChatCompletionResponse(resp)
+					Expect(perr).NotTo(HaveOccurred())
+					Expect(parsed.Model).To(Equal(model),
+						"response for %s should echo the requested model name", model)
 				}
 			}
-
-			By("verifying model-not-found service received no new requests")
-			afterLogCount := countNginxAccessLogs(clientset, utils.ModelNotFoundNamespace, mnfPodName)
-			Expect(afterLogCount).To(Equal(beforeLogCount),
-				"model-not-found service should not have received any requests for known models")
 		})
 	})
 
@@ -663,24 +649,6 @@ var _ = Describe("Model-Based Routing", Ordered, utils.GinkgoLabelRouting, func(
 	})
 })
 
-// countNginxAccessLogs counts POST request lines in the nginx access log.
-// Only POST lines are counted to exclude Kubernetes health probe traffic
-// (GET /) which would cause false positives.
-func countNginxAccessLogs(clientset *kubernetes.Clientset, namespace, podName string) int {
-	logs, err := utils.GetPodLogs(clientset, namespace, podName, "nginx")
-	if err != nil {
-		return 0
-	}
-	count := 0
-	for _, line := range strings.Split(logs, "\n") {
-		line = strings.TrimSpace(line)
-		if line != "" && strings.Contains(line, "\"POST ") {
-			count++
-		}
-	}
-	return count
-}
-
 // findDebugLogTriple searches istio-ingressgateway logs for a request-id that
 // has [PRE-BBR], [POST-EPP], and [RESPONSE] lines, where the POST-EPP line
 // contains the expected model name. Returns the three log lines.
diff --git a/test/e2e/production-stack-E2E-test-scenarios.md b/test/e2e/production-stack-E2E-test-scenarios.md
index 66cdce3..21a3bfe 100644
--- a/test/e2e/production-stack-E2E-test-scenarios.md
+++ b/test/e2e/production-stack-E2E-test-scenarios.md
@@ -144,7 +144,7 @@ the correct pool received the request. EPP-side counters corroborate the schedul
 * Correct model name for ministral — Same validation for the second model; catches per-pool misrouting.
 * Cross-model isolation (serial) — Send N requests for each model sequentially. Scrape `vllm:request_success_total{model_name}` from every shadow pod before and after. Verify that phi pods' counters only incremented for phi requests, and ministral pods' counters only incremented for ministral requests. No cross-pool contamination.
 * Cross-model isolation (concurrent) — Launch **interleaved concurrent** traffic: 20 in-flight phi + 20 in-flight ministral requests at the same time. Verify per-pod `vllm:request_success_total{model_name}` still shows zero cross-contamination. BBR and EPP are two chained ext_proc filters; serial tests cannot expose header-state leakage between concurrent requests within the same Envoy worker.
-* Model-specific route wins over catch-all — While the catch-all `model-not-found` HTTPRoute is deployed (see *Unknown model handling*), requests with a known model name must never hit it. Verify by scraping the `model-not-found` Service's request counter / access log: it must stay at 0 during the above cross-model runs. Guards against HTTPRoute ordering regressions where the catch-all rule silently absorbs valid traffic.
+* Model-specific route wins over catch-all — The catch-all `model-not-found-direct` EnvoyFilter (see *Unknown model handling*) patches an Envoy `direct_response` (status 404) onto the Gateway's virtual host. Requests with a known model name must never hit it: if they did, the response would be 404 instead of 200. Verify by sending requests for each known model and asserting `HTTP 200` plus a body whose `model` field matches the request. Guards against route-ordering regressions where the catch-all rule silently absorbs valid traffic.
 * EPP routing success (metrics) — After the above runs, verify `inference_extension_scheduler_attempts_total{status="success"}` increased by the total requests sent and `{status="failure"}` did not change. Verify `inference_objective_request_total{model_name="routing-phi"}` and `{model_name="routing-ministral"}` match the per-model counts. Proves EPP actively scheduled each request rather than falling through to a default route.
 * Load distribution — With 2 replicas per pool, send 20+ requests per model. Scrape `vllm:request_success_total` from each pod and verify no pod received 0 requests and none received more than 80% of its pool's traffic. Cross-check with `inference_pool_per_pod_queue_size{name, model_server_pod}` to confirm both pods were active. If one pod gets all traffic, EPP's scoring or endpoint list is broken.
 * Debug EnvoyFilter log chain — For **one** representative request in each of the cases above (phi, ministral, concurrent cross-model), tail istio-ingressgateway logs and verify the `inference-debug-filter` Lua chain emitted exactly one `[PRE-BBR]`, one `[POST-EPP]`, and one `[RESPONSE]` line sharing the same `x-request-id`. In the `[POST-EPP]` line, `x-gateway-model-name` equals the request's model field (proves BBR ran) and `x-gateway-destination-endpoint` is a non-empty `IP:port` matching the pod that actually served the request per `vllm:request_success_total` (proves EPP ran and its decision was honoured). Health-check `GET /` traffic must not produce these log lines. This folds the debug/observability surface into the main routing assertions so filter-chain ordering regressions (e.g., Istio upgrade) cannot silently break on-cluster debugging.
@@ -216,7 +216,7 @@ individually so failures can be localised.
 Verifies the Gateway's client-compatibility contract for bad inputs: unknown models hit the catch-all
 JSON 404, and malformed bodies do not crash BBR or leak Envoy's raw error pages.
 
-* 404 for unknown model — Request with `{"model": "does-not-exist", ...}` returns HTTP 404 with the OpenAI-compatible JSON body `{"error":{"code":"model_not_found", ...}}` served by the `model-not-found` Service — not Envoy's raw 404 HTML.
+* 404 for unknown model — Request with `{"model": "does-not-exist", ...}` returns HTTP 404 with the OpenAI-compatible JSON body `{"error":{"code":"model_not_found", ...}}` produced by the per-namespace `model-not-found-direct` EnvoyFilter (Envoy `direct_response`) — not Envoy's raw 404 HTML.
 * Missing `model` field — Request body `{"messages": [...]}` with no `model` key. BBR cannot inject `x-gateway-model-name`; verify the request is handled predictably (routed to the catch-all 404 JSON, not a 500 or hang).
 * Non-string `model` field — Body with `{"model": 42, ...}`. Verify BBR rejects or falls through cleanly (catch-all 404 JSON), with no Envoy 5xx.
 * Non-JSON body on `/v1/*` — Send `text/plain` or truncated JSON to `POST /v1/chat/completions`. Verify response is a well-formed error (4xx), the BBR ext_proc filter does not crash (Envoy stays up, subsequent valid requests succeed), and no goroutine leak appears in BBR logs.
diff --git a/test/e2e/utils/dynamic.go b/test/e2e/utils/dynamic.go
index aaee21a..08f7971 100644
--- a/test/e2e/utils/dynamic.go
+++ b/test/e2e/utils/dynamic.go
@@ -57,23 +57,6 @@ var (
 		Kind:    "Gateway",
 	}
 
-	// HTTPRouteGVK identifies HTTPRoute resources (used to create the
-	// per-namespace catch-all that returns OpenAI-compatible 404 JSON).
-	HTTPRouteGVK = schema.GroupVersionKind{
-		Group:   "gateway.networking.k8s.io",
-		Version: "v1",
-		Kind:    "HTTPRoute",
-	}
-
-	// ReferenceGrantGVK identifies the ReferenceGrant used to permit a
-	// per-case HTTPRoute to reference the shared model-not-found Service
-	// living in the default namespace.
-	ReferenceGrantGVK = schema.GroupVersionKind{
-		Group:   "gateway.networking.k8s.io",
-		Version: "v1beta1",
-		Kind:    "ReferenceGrant",
-	}
-
 	// AuthorizationPolicyGVK identifies the Istio AuthorizationPolicy used
 	// to wire each per-case Gateway into the apikey-ext-authz CUSTOM
 	// provider. The upstream llm-gateway-apikey chart only installs an AP
diff --git a/test/e2e/utils/helm.go b/test/e2e/utils/helm.go
index 4ea1c75..c223c28 100644
--- a/test/e2e/utils/helm.go
+++ b/test/e2e/utils/helm.go
@@ -190,8 +190,9 @@ func modelHarnessChartPath() string {
 
 // InstallModelHarness runs `helm upgrade --install` for the modelharness chart
 // in `namespace`. It provisions the per-namespace Gateway (named
-// "<namespace>-gw" by chart default), the catch-all model-not-found
-// HTTPRoute + ReferenceGrant, and — when authEnabled is true — the
+// "<namespace>-gw" by chart default), the catch-all
+// `model-not-found-direct` EnvoyFilter (Envoy `direct_response` returning
+// 404 + OpenAI-compatible JSON), and — when authEnabled is true — the
 // per-namespace AuthorizationPolicy + APIKey CR. When
 // networkPolicyEnabled is true, the chart additionally renders the
 // default-deny-ingress / allow-inference-traffic NetworkPolicies that
diff --git a/test/e2e/utils/http.go b/test/e2e/utils/http.go
index 0d3c7de..eb0ce79 100644
--- a/test/e2e/utils/http.go
+++ b/test/e2e/utils/http.go
@@ -39,17 +39,6 @@ const (
 	// HTTPTimeout is the default timeout for HTTP requests.
 	// Set high to account for BBR/EPP ext_proc startup latency.
 	HTTPTimeout = 60 * time.Second
-
-	// ModelNotFoundNamespace is the namespace that hosts the cluster-shared
-	// `model-not-found` Deployment + Service installed once per cluster by
-	// hack/e2e/scripts/install-components.sh. Every workload namespace's
-	// catch-all HTTPRoute (rendered by charts/modelharness) forwards
-	// unmatched requests here via a cross-namespace ReferenceGrant.
-	ModelNotFoundNamespace = "default"
-
-	// ModelNotFoundPodLabel is the label selector used to list the
-	// model-not-found Pods (e.g. when scraping nginx access logs).
-	ModelNotFoundPodLabel = "app=model-not-found"
 )
 
 // ChatCompletionRequest represents an OpenAI-compatible chat completion request body.
diff --git a/test/e2e/utils/setup.go b/test/e2e/utils/setup.go
index 6b5bfbb..65602c3 100644
--- a/test/e2e/utils/setup.go
+++ b/test/e2e/utils/setup.go
@@ -33,7 +33,9 @@ import (
 // EnsureNamespace creates the namespace if it does not exist and installs
 // the modelharness Helm chart into it. modelharness owns every per-namespace
 // shared resource: the Istio Gateway (named "<name>-gw" by chart
-// default), the catch-all model-not-found HTTPRoute + ReferenceGrant,
+// default), the catch-all `model-not-found-direct` EnvoyFilter (Envoy
+// `direct_response` returning 404 + OpenAI-compatible JSON for any
+// request not matched by a deployment-specific HTTPRoute),
 // — when authEnabled is true — the AuthorizationPolicy + APIKey CR
 // that wire the Gateway into the cluster-wide apikey-ext-authz CUSTOM
 // provider, and — when networkPolicyEnabled is true — the