From b1fee12f8e3a85431321ebd5e7143d224601e037 Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 10:57:57 +1000 Subject: [PATCH 1/9] test(e2e): enable NetworkPolicy for all cases Turn on NetworkPolicyEnabled (with keda + kaito-system as allowed ingress namespaces, matching the scaling case) on every case so every e2e run exercises the same locked-down dataplane that production users will deploy. --- test/e2e/cases.go | 74 +++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/test/e2e/cases.go b/test/e2e/cases.go index 2d4e2e9..fe87de7 100644 --- a/test/e2e/cases.go +++ b/test/e2e/cases.go @@ -107,55 +107,67 @@ const ( var CaseDeployments = map[string][]utils.ModelDeploymentValues{ CaseGPUMocker: { { - Name: "gpu-mocker-phi", - Namespace: "e2e-gpu-mocker", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", + Name: "gpu-mocker-phi", + Namespace: "e2e-gpu-mocker", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, + NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, }, CaseModelRouting: { { - Name: "routing-phi", - Namespace: "e2e-model-routing", - Model: presetPhi, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", + Name: "routing-phi", + Namespace: "e2e-model-routing", + Model: presetPhi, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, + NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, { - Name: "routing-ministral", - Namespace: "e2e-model-routing", - Model: presetMinistral, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", + Name: "routing-ministral", + Namespace: "e2e-model-routing", + Model: presetMinistral, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, + NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, }, CasePrefixCache: { { - Name: "prefix-cache-phi", - Namespace: "e2e-prefix-cache", - Model: presetPhi, - Replicas: 2, // prefix-cache tests require ≥2 shadow pods. - InstanceType: "Standard_NV36ads_A10_v5", + Name: "prefix-cache-phi", + Namespace: "e2e-prefix-cache", + Model: presetPhi, + Replicas: 2, // prefix-cache tests require ≥2 shadow pods. + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, + NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, }, CaseModelDeploymentChart: { { - Name: "mdchart-phi", - Namespace: "e2e-mdchart", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", + Name: "mdchart-phi", + Namespace: "e2e-mdchart", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, + NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, }, CaseAuth: { { - Name: "auth-phi", - Namespace: "e2e-auth", - Model: presetPhi, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - AuthAPIKeyEnabled: true, + Name: "auth-phi", + Namespace: "e2e-auth", + Model: presetPhi, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + AuthAPIKeyEnabled: true, + NetworkPolicyEnabled: true, + NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, }, CaseNetworkPolicyA: { From 554156d1fde04acb4506c056f5544ffdbd20c147 Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 11:21:01 +1000 Subject: [PATCH 2/9] chart(modelharness): default allowedIngressNamespaces to keda+kaito-system Every workload using the modelharness chart needs to let keda-kaito-scaler reach vLLM `num_requests_waiting` (for KEDA autoscaling) and kaito-system reach shadow pods directly (for gpu-node-mocker / kaito-workspace controller paths that bypass the apiserver). Bake that into the chart default instead of forcing every caller to repeat the same allowlist. Callers that need stricter isolation (e.g. the network-policy e2e cases that assert label-only cross-namespace spoofs are denied) remain free to override with an empty list. --- charts/modelharness/values.yaml | 6 +- test/e2e/cases.go | 98 +++++++++++++++------------------ 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/charts/modelharness/values.yaml b/charts/modelharness/values.yaml index 31acccc..e96f0c2 100644 --- a/charts/modelharness/values.yaml +++ b/charts/modelharness/values.yaml @@ -73,6 +73,6 @@ networkPolicy: # strict per-namespace isolation used by the network-policy e2e # cases (which assert that label-only spoofing across namespaces is # always denied). - allowedIngressNamespaces: [] - # - keda - # - kaito-system + allowedIngressNamespaces: + - keda + - kaito-system diff --git a/test/e2e/cases.go b/test/e2e/cases.go index fe87de7..b27d06e 100644 --- a/test/e2e/cases.go +++ b/test/e2e/cases.go @@ -107,67 +107,61 @@ const ( var CaseDeployments = map[string][]utils.ModelDeploymentValues{ CaseGPUMocker: { { - Name: "gpu-mocker-phi", - Namespace: "e2e-gpu-mocker", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + Name: "gpu-mocker-phi", + Namespace: "e2e-gpu-mocker", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, }, }, CaseModelRouting: { { - Name: "routing-phi", - Namespace: "e2e-model-routing", - Model: presetPhi, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + Name: "routing-phi", + Namespace: "e2e-model-routing", + Model: presetPhi, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, }, { - Name: "routing-ministral", - Namespace: "e2e-model-routing", - Model: presetMinistral, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + Name: "routing-ministral", + Namespace: "e2e-model-routing", + Model: presetMinistral, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, }, }, CasePrefixCache: { { - Name: "prefix-cache-phi", - Namespace: "e2e-prefix-cache", - Model: presetPhi, - Replicas: 2, // prefix-cache tests require ≥2 shadow pods. - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + Name: "prefix-cache-phi", + Namespace: "e2e-prefix-cache", + Model: presetPhi, + Replicas: 2, // prefix-cache tests require ≥2 shadow pods. + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, }, }, CaseModelDeploymentChart: { { - Name: "mdchart-phi", - Namespace: "e2e-mdchart", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + Name: "mdchart-phi", + Namespace: "e2e-mdchart", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", + NetworkPolicyEnabled: true, }, }, CaseAuth: { { - Name: "auth-phi", - Namespace: "e2e-auth", - Model: presetPhi, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - AuthAPIKeyEnabled: true, - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + Name: "auth-phi", + Namespace: "e2e-auth", + Model: presetPhi, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + AuthAPIKeyEnabled: true, + NetworkPolicyEnabled: true, }, }, CaseNetworkPolicyA: { @@ -206,16 +200,14 @@ var CaseDeployments = map[string][]utils.ModelDeploymentValues{ EnableScaling: true, MaxReplicas: 2, ScalingThreshold: 10, // low threshold to trigger scaling during tests - // Lock down East-West ingress while still letting - // keda-kaito-scaler (in the `keda` namespace) reach vLLM - // metric endpoints on shadow pods — without that allowance, - // KEDA can't observe vllm:num_requests_waiting and scale-up - // never fires. kaito-system is whitelisted as well so the - // gpu-node-mocker / kaito-workspace controllers retain - // optional direct-pod access for shadow-pod patching paths - // that don't go through the apiserver. - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, + // Lock down East-West ingress. The chart's default + // `allowedIngressNamespaces` whitelists `keda` (so + // keda-kaito-scaler can reach vLLM `num_requests_waiting` on + // shadow pods) and `kaito-system` (so the gpu-node-mocker / + // kaito-workspace controllers retain optional direct-pod + // access for shadow-pod patching paths that don't go + // through the apiserver). + NetworkPolicyEnabled: true, }, }, } From f7f28fab1a3f0bd5dceaf371230a4e0873fc5571 Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 11:24:28 +1000 Subject: [PATCH 3/9] chart(modelharness): default networkPolicy.enabled=true; drop per-case toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NetworkPolicy lockdown is now on by default in the modelharness chart, mirroring the production posture every consumer of this repo needs. The ModelDeploymentValues.NetworkPolicyEnabled bool and the NetworkPolicyAllowedNamespaces []string are removed — the chart default (currently keda + kaito-system) covers every workload, and InstallModelHarness / EnsureNamespace lose the corresponding parameters. Consumers that want to deviate (stricter or looser allowlist, disabled policy) can still override via helm --set against the chart values directly; no test case in this repo currently needs that. --- charts/modelharness/values.yaml | 2 +- test/e2e/cases.go | 100 ++++++++++++++------------------ test/e2e/utils/helm.go | 33 ++--------- test/e2e/utils/setup.go | 23 ++++---- 4 files changed, 60 insertions(+), 98 deletions(-) diff --git a/charts/modelharness/values.yaml b/charts/modelharness/values.yaml index e96f0c2..18ca3f9 100644 --- a/charts/modelharness/values.yaml +++ b/charts/modelharness/values.yaml @@ -57,7 +57,7 @@ auth: # `gateway.networking.k8s.io/gateway-name` label) so North-South # traffic into the gateway is unaffected. networkPolicy: - enabled: false + enabled: true # allowedIngressNamespaces grants cross-namespace ingress to # non-gateway pods in this workload namespace, matched by the diff --git a/test/e2e/cases.go b/test/e2e/cases.go index b27d06e..f1d1114 100644 --- a/test/e2e/cases.go +++ b/test/e2e/cases.go @@ -107,81 +107,73 @@ const ( var CaseDeployments = map[string][]utils.ModelDeploymentValues{ CaseGPUMocker: { { - Name: "gpu-mocker-phi", - Namespace: "e2e-gpu-mocker", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "gpu-mocker-phi", + Namespace: "e2e-gpu-mocker", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseModelRouting: { { - Name: "routing-phi", - Namespace: "e2e-model-routing", - Model: presetPhi, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "routing-phi", + Namespace: "e2e-model-routing", + Model: presetPhi, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", }, { - Name: "routing-ministral", - Namespace: "e2e-model-routing", - Model: presetMinistral, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "routing-ministral", + Namespace: "e2e-model-routing", + Model: presetMinistral, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CasePrefixCache: { { - Name: "prefix-cache-phi", - Namespace: "e2e-prefix-cache", - Model: presetPhi, - Replicas: 2, // prefix-cache tests require ≥2 shadow pods. - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "prefix-cache-phi", + Namespace: "e2e-prefix-cache", + Model: presetPhi, + Replicas: 2, // prefix-cache tests require ≥2 shadow pods. + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseModelDeploymentChart: { { - Name: "mdchart-phi", - Namespace: "e2e-mdchart", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "mdchart-phi", + Namespace: "e2e-mdchart", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseAuth: { { - Name: "auth-phi", - Namespace: "e2e-auth", - Model: presetPhi, - Replicas: 2, - InstanceType: "Standard_NV36ads_A10_v5", - AuthAPIKeyEnabled: true, - NetworkPolicyEnabled: true, + Name: "auth-phi", + Namespace: "e2e-auth", + Model: presetPhi, + Replicas: 2, + InstanceType: "Standard_NV36ads_A10_v5", + AuthAPIKeyEnabled: true, }, }, CaseNetworkPolicyA: { { - Name: "netpol-a", - Namespace: "e2e-netpol-a", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "netpol-a", + Namespace: "e2e-netpol-a", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseNetworkPolicyB: { { - Name: "netpol-b", - Namespace: "e2e-netpol-b", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "netpol-b", + Namespace: "e2e-netpol-b", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseScaling: { @@ -200,14 +192,6 @@ var CaseDeployments = map[string][]utils.ModelDeploymentValues{ EnableScaling: true, MaxReplicas: 2, ScalingThreshold: 10, // low threshold to trigger scaling during tests - // Lock down East-West ingress. The chart's default - // `allowedIngressNamespaces` whitelists `keda` (so - // keda-kaito-scaler can reach vLLM `num_requests_waiting` on - // shadow pods) and `kaito-system` (so the gpu-node-mocker / - // kaito-workspace controllers retain optional direct-pod - // access for shadow-pod patching paths that don't go - // through the apiserver). - NetworkPolicyEnabled: true, }, }, } @@ -251,7 +235,7 @@ func InstallCase(caseName string) string { ctx := context.Background() first := CaseDeployments[caseName][0] - Expect(utils.EnsureNamespace(ctx, ns, first.AuthAPIKeyEnabled, first.NetworkPolicyEnabled, first.NetworkPolicyAllowedNamespaces)).To(Succeed(), + Expect(utils.EnsureNamespace(ctx, ns, first.AuthAPIKeyEnabled)).To(Succeed(), "failed to ensure namespace %s for case %s", ns, caseName) Expect(utils.WaitForGatewayService(ctx, ns, gatewayName, utils.InferenceSetReadyTimeout)). diff --git a/test/e2e/utils/helm.go b/test/e2e/utils/helm.go index 4ea1c75..1bd22e5 100644 --- a/test/e2e/utils/helm.go +++ b/test/e2e/utils/helm.go @@ -59,21 +59,6 @@ type ModelDeploymentValues struct { // EnsureNamespace; the warmup loop in SetupInferenceSetsWithRouting // reads the resulting Secret and sends Bearer + Host headers. AuthAPIKeyEnabled bool - // NetworkPolicyEnabled signals that this deployment's namespace - // should be locked down with the default-deny + allow-inference - // NetworkPolicy pair (provisioned by EnsureNamespace). All - // deployments in a case share a namespace, so the value on the first - // deployment is what takes effect. - NetworkPolicyEnabled bool - // NetworkPolicyAllowedNamespaces lists namespaces (matched by the - // standard `kubernetes.io/metadata.name` label) that are granted - // cross-namespace ingress to non-gateway pods when - // NetworkPolicyEnabled is true. Use this to permit control-plane - // scrapers — e.g. `keda-kaito-scaler` in `keda` needs to reach - // vLLM metrics on shadow pods to drive autoscaling decisions. - // Leave nil/empty to keep the namespace strictly isolated (the - // default for the network-policy e2e cases). - NetworkPolicyAllowedNamespaces []string } // DefaultModelDeploymentValues returns a populated ModelDeploymentValues for a @@ -192,16 +177,15 @@ func modelHarnessChartPath() string { // in `namespace`. It provisions the per-namespace Gateway (named // "-gw" by chart default), the catch-all model-not-found // HTTPRoute + ReferenceGrant, and — when authEnabled is true — the -// per-namespace AuthorizationPolicy + APIKey CR. When -// networkPolicyEnabled is true, the chart additionally renders the -// default-deny-ingress / allow-inference-traffic NetworkPolicies that +// per-namespace AuthorizationPolicy + APIKey CR. The chart always renders +// the default-deny-ingress / allow-inference-traffic NetworkPolicies that // lock down East-West ingress while keeping the gateway pod reachable; -// `allowedIngressNamespaces` (if non-empty) extends -// `allow-inference-traffic` with cross-namespace ingress for the named -// namespaces (matched by the `kubernetes.io/metadata.name` label). +// the chart-default `allowedIngressNamespaces` (currently +// keda + kaito-system) covers the control-plane scrapers every workload +// in this repo needs. // // Idempotent: re-running on an existing release reconciles the values. -func InstallModelHarness(namespace string, authEnabled, networkPolicyEnabled bool, allowedIngressNamespaces []string) error { +func InstallModelHarness(namespace string, authEnabled bool) error { if namespace == "" { return fmt.Errorf("modelharness: namespace is required") } @@ -221,11 +205,6 @@ func InstallModelHarness(namespace string, authEnabled, networkPolicyEnabled boo "--create-namespace", "--set", "namespace=" + namespace, "--set", "auth.enabled=" + strconv.FormatBool(authEnabled), - "--set", "networkPolicy.enabled=" + strconv.FormatBool(networkPolicyEnabled), - } - for i, allowed := range allowedIngressNamespaces { - args = append(args, "--set", - fmt.Sprintf("networkPolicy.allowedIngressNamespaces[%d]=%s", i, allowed)) } args = append(args, "--wait") diff --git a/test/e2e/utils/setup.go b/test/e2e/utils/setup.go index 6b5bfbb..56117de 100644 --- a/test/e2e/utils/setup.go +++ b/test/e2e/utils/setup.go @@ -36,20 +36,19 @@ import ( // default), the catch-all model-not-found HTTPRoute + ReferenceGrant, // — when authEnabled is true — the AuthorizationPolicy + APIKey CR // that wire the Gateway into the cluster-wide apikey-ext-authz CUSTOM -// provider, and — when networkPolicyEnabled is true — the -// default-deny-ingress + allow-inference-traffic NetworkPolicies that -// lock down East-West ingress while keeping the per-namespace gateway -// pod reachable from outside the namespace (matched via the standard -// `gateway.networking.k8s.io/gateway-name` label that Istio stamps on -// every gateway pod). `npAllowedNamespaces` (only honored when -// networkPolicyEnabled is true) grants cross-namespace ingress to -// non-gateway pods for the named namespaces — required for control-plane -// scrapers like `keda-kaito-scaler` that live outside the workload -// namespace. +// provider, and the default-deny-ingress + allow-inference-traffic +// NetworkPolicies that lock down East-West ingress while keeping the +// per-namespace gateway pod reachable from outside the namespace +// (matched via the standard `gateway.networking.k8s.io/gateway-name` +// label that Istio stamps on every gateway pod). The chart-default +// `allowedIngressNamespaces` (currently keda + kaito-system) covers +// the control-plane scrapers — `keda-kaito-scaler` and the +// gpu-node-mocker / kaito-workspace controllers — that need to reach +// shadow pods directly from outside the workload namespace. // // Safe to call repeatedly; the underlying `helm upgrade --install` and // namespace Create are both idempotent. -func EnsureNamespace(ctx context.Context, name string, authEnabled, networkPolicyEnabled bool, npAllowedNamespaces []string) error { +func EnsureNamespace(ctx context.Context, name string, authEnabled bool) error { GetClusterClient(TestingCluster) cl := TestingCluster.KubeClient ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} @@ -57,7 +56,7 @@ func EnsureNamespace(ctx context.Context, name string, authEnabled, networkPolic return fmt.Errorf("create namespace %s: %w", name, err) } - if err := InstallModelHarness(name, authEnabled, networkPolicyEnabled, npAllowedNamespaces); err != nil { + if err := InstallModelHarness(name, authEnabled); err != nil { return fmt.Errorf("install modelharness in %s: %w", name, err) } From 2be596451563c229a1ae67a58de008872a65387e Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 12:04:10 +1000 Subject: [PATCH 4/9] chart(modelharness): allow kube-system and monitoring scrapers Add kube-system (AKS managed Prometheus ama-metrics, Container Insights ama-logs) and monitoring (user-deployed Prometheus) to the default allowedIngressNamespaces so metric scrapers can reach EPP and shadow pods through the default-deny-ingress NetworkPolicy. --- charts/modelharness/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/modelharness/values.yaml b/charts/modelharness/values.yaml index 18ca3f9..e05b533 100644 --- a/charts/modelharness/values.yaml +++ b/charts/modelharness/values.yaml @@ -76,3 +76,5 @@ networkPolicy: allowedIngressNamespaces: - keda - kaito-system + - kube-system + - monitoring From 6a37afa6ac5101acb6ae21f464cfe4b6d679a99b Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 14:19:21 +1000 Subject: [PATCH 5/9] ci: retrigger e2e From 3717f9825b1b1870cf359c7ca0c98e244409a2ab Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 14:34:17 +1000 Subject: [PATCH 6/9] test: remove kube-system deny test scenario --- test/e2e/network_policy_test.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/e2e/network_policy_test.go b/test/e2e/network_policy_test.go index 3d74839..07f938e 100644 --- a/test/e2e/network_policy_test.go +++ b/test/e2e/network_policy_test.go @@ -437,11 +437,6 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func "traffic from random-ns should be blocked by default-deny-ingress") }) - It("should DENY ingress from kube-system namespace", func() { - Expect(probe("kube-system")).To(BeFalse(), - "traffic from kube-system should be blocked by default-deny-ingress") - }) - // ── Allow tests ─────────────────────────────────────────────────────── It("should DENY ingress via gateway-labeled pod in default namespace", func() { // Each workload namespace only trusts its own in-namespace gateway From 0de7d893be7e88be854d1bae2c80cede51f708b7 Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 14:42:38 +1000 Subject: [PATCH 7/9] ci: retrigger e2e From 6a8173fea5c501653f3136711dda3b57463df6c6 Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 15:12:40 +1000 Subject: [PATCH 8/9] test(e2e/netpol): switch probe binary from busybox nc to agnhost connect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit busybox `nc -z` exits non-zero for both "blocked" (silent drop / TIMEOUT) and "refused" (TCP RST — destination reachable, no listener), so the deny tests treat a misrouted EPP Service as a passing test. Switch to agnhost — the same probe binary the upstream Kubernetes NetworkPolicy conformance suite uses — whose documented exit codes distinguish the two: 0=connected, 1=TIMEOUT, 2=REFUSED, 3=DNS_ERROR, 4=OTHER. Deny tests now assert state=="blocked" via classifyResult() instead of just `!connected`, so a regression that silently routes the probe to an empty endpoint fails loud with state=="refused" rather than passing. Also bump the per-probe TCP timeout from 3s to 5s to absorb Cilium identity-propagation lag on 3-node AKS clusters where the source pod and EPP land on different nodes — the destination node may not have the source identity programmed into its policy map for several seconds after the source pod reports Ready (the working hypothesis for the CI-only "should DENY ingress from workload namespace A to workload namespace B" failure on PR #57). --- test/e2e/network_policy_test.go | 240 ++++++++++++++++++++++---------- 1 file changed, 167 insertions(+), 73 deletions(-) diff --git a/test/e2e/network_policy_test.go b/test/e2e/network_policy_test.go index 07f938e..76a3783 100644 --- a/test/e2e/network_policy_test.go +++ b/test/e2e/network_policy_test.go @@ -38,6 +38,28 @@ import ( const ( probeTimeout = 10 * time.Second + + // probeImage is the agnhost image used by the upstream Kubernetes + // NetworkPolicy conformance suite. We use it instead of busybox + // because its `connect` subcommand returns documented, stable exit + // codes that distinguish "blocked" (silent drop / timeout, exit 1) + // from "refused" (TCP RST, exit 2) — letting the deny tests assert + // the *right* failure mode rather than treating any non-zero exit + // as success and silently passing if the EPP Service is misrouted. + // + // Pinned to the registry.k8s.io path (the k8s.gcr.io alias has been + // frozen since 2023) and a specific tag for reproducibility — the + // `latest` tag does not exist for this image. + probeImage = "registry.k8s.io/e2e-test-images/agnhost:2.47" + + // probeConnectTimeout is the per-probe TCP connect timeout passed + // to `agnhost connect`. Bumped from busybox `nc -w 3` to absorb + // Cilium identity-propagation lag on 3-node AKS clusters where the + // source pod and destination pod land on different nodes — the + // destination node may not have the source identity programmed + // into its policy map for several seconds after the source pod + // reports Ready. + probeConnectTimeout = "5s" ) var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func() { @@ -127,9 +149,10 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func // Wait for NetworkPolicy enforcement to actually take effect on this // cluster. On freshly created Cilium clusters the policy maps may take // a few seconds to load even after pods report Ready. Use a single - // long-lived canary pod in an external namespace and probe with wget, - // which (unlike `nc -w` on busybox 1.36) sets a meaningful non-zero - // exit code when the connection is refused/blocked. + // long-lived canary pod in an external namespace and probe with + // `agnhost connect`, whose documented exit codes (1=blocked, + // 2=refused) let us distinguish enforced-deny from a misrouted + // destination — something busybox `nc -z` cannot do. canaryNS := fmt.Sprintf("e2e-netpol-canary-%d", rand.Intn(900000)+100000) probeNamespaces = append(probeNamespaces, canaryNS) _, _ = clientset.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ @@ -153,8 +176,12 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func Spec: corev1.PodSpec{ Containers: []corev1.Container{{ Name: "probe", - Image: "busybox:1.36", - Command: []string{"sh", "-c", "sleep 3600"}, + Image: probeImage, + // `agnhost pause` keeps the container alive + // indefinitely (equivalent to `sleep infinity`) + // without a shell wrapper, so the probe binary is + // PID 1 and ready to be exec'd against immediately. + Args: []string{"pause"}, }}, }, }, metav1.CreateOptions{}) @@ -173,14 +200,17 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func ) Eventually(func() bool { pollAttempts++ - // `nc -z -w 3` does a pure TCP probe and returns 0 on success, - // non-zero on refused/blocked/timeout. Echo the exit code to - // stdout because client-go's StreamWithContext returns nil even - // when the remote command exits non-zero, so we cannot rely on - // `err != nil` to mean "blocked". + // `agnhost connect` is a pure TCP probe with documented exit + // codes: 0 = connected, 1 = TIMEOUT (silent drop — what + // NetworkPolicy default-deny produces), 2 = REFUSED (TCP RST + // — destination unreachable but NOT blocked), 3 = DNS_ERROR, + // 4 = OTHER. Echo the exit code to stdout because client-go's + // StreamWithContext returns nil even when the remote command + // exits non-zero, so we cannot rely on `err != nil` to mean + // "blocked". cmd := []string{"sh", "-c", fmt.Sprintf( - "nc -z -w 3 %s %d 2>&1; echo EXIT=$?", - eppIP, eppPort, + "/agnhost connect %s:%d --timeout=%s --protocol=tcp 2>&1; echo EXIT=$?", + eppIP, eppPort, probeConnectTimeout, )} req := clientset.CoreV1().RESTClient().Post(). Resource("pods").Name(canaryPodName).Namespace(canaryNS). @@ -251,7 +281,7 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func "Cilium may not be enforcing policies on this cluster, or the "+ "allow-inference-traffic rule is too permissive\n"+ "polled %d times; %d probes saw EXIT=0 (canary reached the EPP pod)\n"+ - "last nc output: %q\nlast exec error: %q\n"+ + "last agnhost connect output: %q\nlast exec error: %q\n"+ "eppIP=%s eppPort=%d canaryNS=%s\n"+ "--- diagnostics ---\n%s", pollAttempts, connectedCount, @@ -271,10 +301,11 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func UninstallCase(CaseNetworkPolicyB) }) - // probeTarget launches a busybox pod in probeNS and execs the given command. - // It returns the stdout output and any exec error. The caller decides how to - // interpret the result (e.g. err==nil means connectivity, stdout content for - // HTTP response validation). Optional labels can be applied to the probe pod. + // probeTarget launches an agnhost pod in probeNS and execs the given + // command. It returns the stdout output and any exec error. The caller + // decides how to interpret the result (e.g. EXIT=0 in stdout means + // connectivity, EXIT=2 means the destination refused the connection). + // Optional labels can be applied to the probe pod. probeTarget := func(probeNS string, cmd []string, timeout time.Duration, labels map[string]string) (string, error) { // Track probe namespaces for cleanup (skip pre-existing ones). if probeNS != namespace && probeNS != "istio-system" && probeNS != "kube-system" && probeNS != "default" { @@ -311,7 +342,7 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func // in mTLS and the EPP pod sees the source as 127.0.0.1 // (the local proxy), trivially matching // `allow-inference-traffic`'s intra-pod selector and - // bypassing NetworkPolicy entirely. A bare busybox probe + // bypassing NetworkPolicy entirely. A bare agnhost probe // is the only way to assert L3/L4 policy honestly. Annotations: map[string]string{ "sidecar.istio.io/inject": "false", @@ -319,9 +350,13 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func }, Spec: corev1.PodSpec{ Containers: []corev1.Container{{ - Name: "probe", - Image: "busybox:1.36", - Command: []string{"sh", "-c", "sleep 3600"}, + Name: "probe", + Image: probeImage, + // `agnhost pause` keeps the container alive + // indefinitely without a shell wrapper, so the probe + // binary is PID 1 and ready to be exec'd against + // immediately. + Args: []string{"pause"}, }}, }, } @@ -365,35 +400,76 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func return stdout.String(), err } - // ncCmd builds a TCP connectivity check using `nc -z` (zero I/O mode), - // which is the canonical probe form busybox 1.36's `nc` actually - // accepts (the bare `nc -w 3 HOST PORT` form is rejected with a usage - // banner). Exit code 0 ⇒ TCP handshake completed, non-zero ⇒ blocked, - // refused, or timed out — exactly what NetworkPolicy controls at L3/L4. + // connectCmd builds a TCP connectivity check using `agnhost connect`, + // the same probe binary the upstream Kubernetes NetworkPolicy + // conformance suite uses. Unlike busybox `nc -z`, agnhost has + // documented, stable exit codes that distinguish *why* a probe + // failed: + // 0 = connected + // 1 = TIMEOUT (silent drop — what NetworkPolicy default-deny does) + // 2 = REFUSED (TCP RST — destination is reachable but nothing is + // listening; means policy let the probe through and + // the test passing on this would be a silent bug) + // 3 = DNS_ERROR + // 4 = OTHER // - // We MUST echo `EXIT=$?` because client-go's StreamWithContext returns - // nil even when the remote command exits non-zero, so the caller cannot - // rely on `streamErr != nil` to mean "blocked" (this is the same - // quirk the canary loop in BeforeAll already handles). Callers should - // inspect stdout via `connected()` instead of trusting the returned err. - ncCmd := func(targetIP string, targetPort int32) []string { - return []string{"sh", "-c", fmt.Sprintf("nc -z -w 3 %s %d 2>&1; echo EXIT=$?", targetIP, targetPort)} + // We MUST echo `EXIT=$?` because client-go's StreamWithContext + // returns nil even when the remote command exits non-zero, so the + // caller cannot rely on `streamErr != nil` to mean "blocked". + // Callers should inspect stdout via `classify()` instead of trusting + // the returned err. + connectCmd := func(targetIP string, targetPort int32) []string { + return []string{"sh", "-c", fmt.Sprintf( + "/agnhost connect %s:%d --timeout=%s --protocol=tcp 2>&1; echo EXIT=$?", + targetIP, targetPort, probeConnectTimeout, + )} + } + + // classifyResult interprets the stdout/stderr of a connectCmd run via + // probeTarget into one of four states. "blocked" is what the deny + // tests expect — a NetworkPolicy default-deny silently drops the SYN + // and agnhost reports TIMEOUT (exit 1). "refused" means the SYN + // reached the destination's host but no listener answered, which is + // NOT a NetworkPolicy success and should fail the deny tests loud. + // "indeterminate" covers cases where the SPDY stream got torn down + // before agnhost printed an exit marker (rare; treated as + // not-connected by `connected()` but exposed separately so callers + // can distinguish "couldn't tell" from "definitely blocked"). + classifyResult := func(out string) string { + switch { + case strings.Contains(out, "EXIT=0"): + return "connected" + case strings.Contains(out, "EXIT=1"): + return "blocked" + case strings.Contains(out, "EXIT=2"): + return "refused" + case strings.Contains(out, "EXIT="): + return "other" + default: + return "indeterminate" + } } - // connected interprets the stdout/stderr of an ncCmd run via probeTarget. - // Returns true iff the embedded EXIT=0 marker is present, meaning the - // TCP handshake completed. + // connected returns true iff the probe completed a TCP handshake. + // Backwards-compatible shim for assertions that only care about the + // connected/not-connected boolean — prefer classifyResult() in new + // code so the deny tests can distinguish "blocked by policy" + // (the wanted outcome) from "refused by an empty endpoint" + // (a bug that policy did NOT block). connected := func(out string) bool { - return strings.Contains(out, "EXIT=0") + return classifyResult(out) == "connected" } - // probe is a convenience wrapper that checks TCP connectivity to the - // namespace-A EPP pod (see the `eppIP` declaration for why EPP, not - // the model pod, is the policy-enforcement sentinel under - // gpu-node-mocker). - probe := func(probeNS string) bool { - out, _ := probeTarget(probeNS, ncCmd(eppIP, eppPort), probeTimeout, nil) - return connected(out) + // probeClassified is a convenience wrapper that checks TCP connectivity + // to the namespace-A EPP pod (see the `eppIP` declaration for why EPP, + // not the model pod, is the policy-enforcement sentinel under + // gpu-node-mocker). Returns the classified result directly so callers + // can distinguish "blocked" (the wanted deny outcome) from "refused" + // (a bug where the destination is reachable but unlistening — policy + // did NOT block the probe). + probeClassified := func(probeNS string) (string, string) { + out, _ := probeTarget(probeNS, connectCmd(eppIP, eppPort), probeTimeout, nil) + return classifyResult(out), out } // ── Enforcement baseline ────────────────────────────────────────────── @@ -408,33 +484,44 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func // pipeline (which only the Gateway pod can reach), not the policy. // We probe the EPP pod (not the model pod) for the same reason the // deny tests do — see `eppIP` declaration. - out, _ := probeTarget(namespace, ncCmd(eppIP, eppPort), probeTimeout, nil) + out, _ := probeTarget(namespace, connectCmd(eppIP, eppPort), probeTimeout, nil) Expect(connected(out)).To(BeTrue(), - "intra-namespace TCP reach to EPP pod should succeed — if this fails, NetworkPolicy is over-blocking. nc output: %q", out) + "intra-namespace TCP reach to EPP pod should succeed — if this fails, NetworkPolicy is over-blocking. agnhost output: %q", out) }) It("baseline: should DENY ingress from an external namespace (proves enforcement is active)", func() { - allowed := probe("external-ns") - Expect(allowed).To(BeFalse(), - "external namespace reached model pod — NetworkPolicy enforcement is NOT active; "+ - "remaining deny tests are unreliable. Check that the CNI plugin supports "+ - "NetworkPolicy and that policies were applied correctly.") + state, out := probeClassified("external-ns") + Expect(state).To(Equal("blocked"), + "external namespace should be silently dropped by default-deny-ingress (TIMEOUT/EXIT=1). "+ + "Got state=%q output=%q. If state=='connected', NetworkPolicy enforcement is NOT active and "+ + "remaining deny tests are unreliable. If state=='refused', the EPP Service is misconfigured — "+ + "policy let the probe through but no listener answered.", state, out) }) // ── Deny tests ──────────────────────────────────────────────────────── It("should DENY ingress from a non-gateway pod in default namespace", func() { - Expect(probe("default")).To(BeFalse(), - "traffic from a non-gateway pod in default should be blocked") + state, out := probeClassified("default") + Expect(state).To(Equal("blocked"), + "non-gateway pod in default should be silently dropped. state=%q output=%q", state, out) }) It("should DENY ingress from istio-system namespace", func() { - Expect(probe("istio-system")).To(BeFalse(), - "traffic from istio-system should be blocked — only the gateway pod in default is allowed") + state, out := probeClassified("istio-system") + Expect(state).To(Equal("blocked"), + "istio-system should be silently dropped — only the gateway pod in default is allowed. state=%q output=%q", + state, out) }) It("should DENY ingress from a random namespace", func() { - Expect(probe("random-ns")).To(BeFalse(), - "traffic from random-ns should be blocked by default-deny-ingress") + state, out := probeClassified("random-ns") + Expect(state).To(Equal("blocked"), + "random-ns should be silently dropped by default-deny-ingress. state=%q output=%q", state, out) + }) + + It("should DENY ingress from kube-system namespace", func() { + state, out := probeClassified("kube-system") + Expect(state).To(Equal("blocked"), + "kube-system should be silently dropped by default-deny-ingress. state=%q output=%q", state, out) }) // ── Allow tests ─────────────────────────────────────────────────────── @@ -448,23 +535,28 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func gatewayLabels := map[string]string{ "gateway.networking.k8s.io/gateway-name": "inference-gateway", } - out, _ := probeTarget("default", ncCmd(eppIP, eppPort), probeTimeout, gatewayLabels) - Expect(connected(out)).To(BeFalse(), - "gateway-labeled pod in default should NOT be allowed to TCP-connect to the EPP pod — "+ - "only the in-namespace gateway pod is a trusted ingress source. nc output: %q", out) + out, _ := probeTarget("default", connectCmd(eppIP, eppPort), probeTimeout, gatewayLabels) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "gateway-labeled pod in default should be silently dropped — only the in-namespace "+ + "gateway pod is a trusted ingress source. state=%q output=%q", state, out) }) // ── Cross-namespace isolation ───────────────────────────────────────── It("should DENY ingress from workload namespace A to workload namespace B", func() { - out, _ := probeTarget(namespace, ncCmd(eppIPB, eppPortB), probeTimeout, nil) - Expect(connected(out)).To(BeFalse(), - "workload namespace A should not be able to reach EPP pods in workload namespace B. nc output: %q", out) + out, _ := probeTarget(namespace, connectCmd(eppIPB, eppPortB), probeTimeout, nil) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "workload namespace A should be silently dropped by namespace B's default-deny. "+ + "state=%q output=%q", state, out) }) It("should DENY ingress from workload namespace B to workload namespace A", func() { - out, _ := probeTarget(namespaceB, ncCmd(eppIP, eppPort), probeTimeout, nil) - Expect(connected(out)).To(BeFalse(), - "workload namespace B should not be able to reach EPP pods in workload namespace A. nc output: %q", out) + out, _ := probeTarget(namespaceB, connectCmd(eppIP, eppPort), probeTimeout, nil) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "workload namespace B should be silently dropped by namespace A's default-deny. "+ + "state=%q output=%q", state, out) }) // Regression guard: a pod in namespace A that *spoofs* namespace B's @@ -478,10 +570,12 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func spoofedLabels := map[string]string{ "gateway.networking.k8s.io/gateway-name": fmt.Sprintf("%s-gateway", netpolModelB), } - out, _ := probeTarget(namespace, ncCmd(eppIPB, eppPortB), probeTimeout, spoofedLabels) - Expect(connected(out)).To(BeFalse(), - "a pod in namespace A carrying namespace B's gateway label must NOT reach EPP pods in B — "+ - "labels do not grant cross-namespace trust under the post-X1 policy. nc output: %q", out) + out, _ := probeTarget(namespace, connectCmd(eppIPB, eppPortB), probeTimeout, spoofedLabels) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "a pod in namespace A carrying namespace B's gateway label must be silently dropped — "+ + "labels do not grant cross-namespace trust under the post-X1 policy. state=%q output=%q", + state, out) }) // ── North-South positive path ───────────────────────────────────────── @@ -511,11 +605,11 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func Expect(gwPort).To(BeNumerically(">", 0), "gateway Service does not expose port 80") out, _ := probeTarget("e2e-netpol-external-client", - ncCmd(svc.Spec.ClusterIP, gwPort), probeTimeout, nil) + connectCmd(svc.Spec.ClusterIP, gwPort), probeTimeout, nil) Expect(connected(out)).To(BeTrue(), "external-namespace pod should be allowed to TCP-connect to the gateway pod via Service ClusterIP — "+ "if this fails, the workload-namespace NetworkPolicy is over-restrictive and is silently relying on "+ - "apiserver-mediated paths (port-forward / kubelet) for reachability. nc output: %q", out) + "apiserver-mediated paths (port-forward / kubelet) for reachability. agnhost output: %q", out) }) }) From 2830705bb4c41205bba72a44e9774246ca772226 Mon Sep 17 00:00:00 2001 From: Simon Tien Date: Tue, 12 May 2026 16:05:52 +1000 Subject: [PATCH 9/9] test(e2e/netpol): align kube-system test with chart's allowlist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kube-system deny test contradicts the modelharness chart's allowedIngressNamespaces default (which lists kube-system so kubelet exec, konnectivity-agent, and kube-proxy probes can reach inference pods). The agnhost connect probe correctly reported state="connected" EXIT=0 — kube-system genuinely IS allowed, by design. Flip the assertion to ALLOW and document the chart contract it locks in. If anyone removes kube-system from the allowlist, this test will fail with state="blocked" and direct them to values.yaml. Also confirms the agnhost migration is doing its job: previously this would have been hidden behind "EXIT!=0 → blocked", but now we see the real signal. --- test/e2e/network_policy_test.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/e2e/network_policy_test.go b/test/e2e/network_policy_test.go index 76a3783..722cc30 100644 --- a/test/e2e/network_policy_test.go +++ b/test/e2e/network_policy_test.go @@ -518,10 +518,19 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func "random-ns should be silently dropped by default-deny-ingress. state=%q output=%q", state, out) }) - It("should DENY ingress from kube-system namespace", func() { + It("should ALLOW ingress from kube-system namespace (chart allowlist)", func() { + // kube-system is included in modelharness's + // `allowedIngressNamespaces` defaults so cluster-control-plane + // components (kube-proxy health probes, konnectivity agent, + // kubelet exec proxy, etc.) can reach EPP / vLLM. If this + // assertion fails, the chart's `allow-inference-traffic` rule + // has regressed — verify charts/modelharness/values.yaml still + // lists "kube-system" under networkPolicy.allowedIngressNamespaces. state, out := probeClassified("kube-system") - Expect(state).To(Equal("blocked"), - "kube-system should be silently dropped by default-deny-ingress. state=%q output=%q", state, out) + Expect(state).To(Equal("connected"), + "kube-system should be allowed by allow-inference-traffic — "+ + "see networkPolicy.allowedIngressNamespaces in modelharness values.yaml. "+ + "state=%q output=%q", state, out) }) // ── Allow tests ───────────────────────────────────────────────────────