diff --git a/charts/modelharness/values.yaml b/charts/modelharness/values.yaml index 31acccc..e05b533 100644 --- a/charts/modelharness/values.yaml +++ b/charts/modelharness/values.yaml @@ -57,7 +57,7 @@ auth: # `gateway.networking.k8s.io/gateway-name` label) so North-South # traffic into the gateway is unaffected. networkPolicy: - enabled: false + enabled: true # allowedIngressNamespaces grants cross-namespace ingress to # non-gateway pods in this workload namespace, matched by the @@ -73,6 +73,8 @@ networkPolicy: # strict per-namespace isolation used by the network-policy e2e # cases (which assert that label-only spoofing across namespaces is # always denied). - allowedIngressNamespaces: [] - # - keda - # - kaito-system + allowedIngressNamespaces: + - keda + - kaito-system + - kube-system + - monitoring diff --git a/test/e2e/cases.go b/test/e2e/cases.go index 2d4e2e9..f1d1114 100644 --- a/test/e2e/cases.go +++ b/test/e2e/cases.go @@ -160,22 +160,20 @@ var CaseDeployments = map[string][]utils.ModelDeploymentValues{ }, CaseNetworkPolicyA: { { - Name: "netpol-a", - Namespace: "e2e-netpol-a", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "netpol-a", + Namespace: "e2e-netpol-a", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseNetworkPolicyB: { { - Name: "netpol-b", - Namespace: "e2e-netpol-b", - Model: presetPhi, - Replicas: 1, - InstanceType: "Standard_NV36ads_A10_v5", - NetworkPolicyEnabled: true, + Name: "netpol-b", + Namespace: "e2e-netpol-b", + Model: presetPhi, + Replicas: 1, + InstanceType: "Standard_NV36ads_A10_v5", }, }, CaseScaling: { @@ -194,16 +192,6 @@ var CaseDeployments = map[string][]utils.ModelDeploymentValues{ EnableScaling: true, MaxReplicas: 2, ScalingThreshold: 10, // low threshold to trigger scaling during tests - // Lock down East-West ingress while still letting - // keda-kaito-scaler (in the `keda` namespace) reach vLLM - // metric 
endpoints on shadow pods — without that allowance, - // KEDA can't observe vllm:num_requests_waiting and scale-up - // never fires. kaito-system is whitelisted as well so the - // gpu-node-mocker / kaito-workspace controllers retain - // optional direct-pod access for shadow-pod patching paths - // that don't go through the apiserver. - NetworkPolicyEnabled: true, - NetworkPolicyAllowedNamespaces: []string{"keda", "kaito-system"}, }, }, } @@ -247,7 +235,7 @@ func InstallCase(caseName string) string { ctx := context.Background() first := CaseDeployments[caseName][0] - Expect(utils.EnsureNamespace(ctx, ns, first.AuthAPIKeyEnabled, first.NetworkPolicyEnabled, first.NetworkPolicyAllowedNamespaces)).To(Succeed(), + Expect(utils.EnsureNamespace(ctx, ns, first.AuthAPIKeyEnabled)).To(Succeed(), "failed to ensure namespace %s for case %s", ns, caseName) Expect(utils.WaitForGatewayService(ctx, ns, gatewayName, utils.InferenceSetReadyTimeout)). diff --git a/test/e2e/network_policy_test.go b/test/e2e/network_policy_test.go index 3d74839..722cc30 100644 --- a/test/e2e/network_policy_test.go +++ b/test/e2e/network_policy_test.go @@ -38,6 +38,28 @@ import ( const ( probeTimeout = 10 * time.Second + + // probeImage is the agnhost image used by the upstream Kubernetes + // NetworkPolicy conformance suite. We use it instead of busybox + // because its `connect` subcommand prints a documented, stable + // failure reason to stderr — TIMEOUT (silent drop) vs REFUSED (TCP + // RST); it exits 1 for every failure — letting the deny tests assert + // the *right* failure mode rather than treating any non-zero exit + // as success and silently passing if the EPP Service is misrouted. + // + // Pinned to the registry.k8s.io path (the k8s.gcr.io alias has been + // frozen since 2023) and a specific tag for reproducibility — the + // `latest` tag does not exist for this image.
+ probeImage = "registry.k8s.io/e2e-test-images/agnhost:2.47" + + // probeConnectTimeout is the per-probe TCP connect timeout passed + // to `agnhost connect`. Bumped from busybox `nc -w 3` to absorb + // Cilium identity-propagation lag on 3-node AKS clusters where the + // source pod and destination pod land on different nodes — the + // destination node may not have the source identity programmed + // into its policy map for several seconds after the source pod + // reports Ready. + probeConnectTimeout = "5s" ) var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func() { @@ -127,9 +149,10 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func // Wait for NetworkPolicy enforcement to actually take effect on this // cluster. On freshly created Cilium clusters the policy maps may take // a few seconds to load even after pods report Ready. Use a single - // long-lived canary pod in an external namespace and probe with wget, - // which (unlike `nc -w` on busybox 1.36) sets a meaningful non-zero - // exit code when the connection is refused/blocked. + // long-lived canary pod in an external namespace and probe with + // `agnhost connect`, whose printed failure reason (TIMEOUT = silent + // drop, REFUSED = TCP RST) lets us distinguish enforced-deny from a + // misrouted destination — something busybox `nc -z` cannot do.
canaryNS := fmt.Sprintf("e2e-netpol-canary-%d", rand.Intn(900000)+100000) probeNamespaces = append(probeNamespaces, canaryNS) _, _ = clientset.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ @@ -153,8 +176,12 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func Spec: corev1.PodSpec{ Containers: []corev1.Container{{ Name: "probe", - Image: "busybox:1.36", - Command: []string{"sh", "-c", "sleep 3600"}, + Image: probeImage, + // `agnhost pause` keeps the container alive + // indefinitely (equivalent to `sleep infinity`) + // without a shell wrapper, so the probe binary is + // PID 1 and ready to be exec'd against immediately. + Args: []string{"pause"}, }}, }, }, metav1.CreateOptions{}) @@ -173,14 +200,17 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func ) Eventually(func() bool { pollAttempts++ - // `nc -z -w 3` does a pure TCP probe and returns 0 on success, - // non-zero on refused/blocked/timeout. Echo the exit code to - // stdout because client-go's StreamWithContext returns nil even - // when the remote command exits non-zero, so we cannot rely on - // `err != nil` to mean "blocked". + // `agnhost connect` is a pure TCP probe with documented exit + // codes: 0 = connected, 1 = TIMEOUT (silent drop — what + // NetworkPolicy default-deny produces), 2 = REFUSED (TCP RST + // — destination unreachable but NOT blocked), 3 = DNS_ERROR, + // 4 = OTHER. Echo the exit code to stdout because client-go's + // StreamWithContext returns nil even when the remote command + // exits non-zero, so we cannot rely on `err != nil` to mean + // "blocked". cmd := []string{"sh", "-c", fmt.Sprintf( - "nc -z -w 3 %s %d 2>&1; echo EXIT=$?", - eppIP, eppPort, + "/agnhost connect %s:%d --timeout=%s --protocol=tcp 2>&1; echo EXIT=$?", + eppIP, eppPort, probeConnectTimeout, )} req := clientset.CoreV1().RESTClient().Post(). Resource("pods").Name(canaryPodName).Namespace(canaryNS). 
@@ -251,7 +281,7 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func "Cilium may not be enforcing policies on this cluster, or the "+ "allow-inference-traffic rule is too permissive\n"+ "polled %d times; %d probes saw EXIT=0 (canary reached the EPP pod)\n"+ - "last nc output: %q\nlast exec error: %q\n"+ + "last agnhost connect output: %q\nlast exec error: %q\n"+ "eppIP=%s eppPort=%d canaryNS=%s\n"+ "--- diagnostics ---\n%s", pollAttempts, connectedCount, @@ -271,10 +301,11 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func UninstallCase(CaseNetworkPolicyB) }) - // probeTarget launches a busybox pod in probeNS and execs the given command. - // It returns the stdout output and any exec error. The caller decides how to - // interpret the result (e.g. err==nil means connectivity, stdout content for - // HTTP response validation). Optional labels can be applied to the probe pod. + // probeTarget launches an agnhost pod in probeNS and execs the given + // command. It returns the stdout output and any exec error. The caller + // decides how to interpret the result (e.g. EXIT=0 in stdout means + // connectivity, EXIT=2 means the destination refused the connection). + // Optional labels can be applied to the probe pod. probeTarget := func(probeNS string, cmd []string, timeout time.Duration, labels map[string]string) (string, error) { // Track probe namespaces for cleanup (skip pre-existing ones). if probeNS != namespace && probeNS != "istio-system" && probeNS != "kube-system" && probeNS != "default" { @@ -311,7 +342,7 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func // in mTLS and the EPP pod sees the source as 127.0.0.1 // (the local proxy), trivially matching // `allow-inference-traffic`'s intra-pod selector and - // bypassing NetworkPolicy entirely. A bare busybox probe + // bypassing NetworkPolicy entirely. 
A bare agnhost probe // is the only way to assert L3/L4 policy honestly. Annotations: map[string]string{ "sidecar.istio.io/inject": "false", @@ -319,9 +350,13 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func }, Spec: corev1.PodSpec{ Containers: []corev1.Container{{ - Name: "probe", - Image: "busybox:1.36", - Command: []string{"sh", "-c", "sleep 3600"}, + Name: "probe", + Image: probeImage, + // `agnhost pause` keeps the container alive + // indefinitely without a shell wrapper, so the probe + // binary is PID 1 and ready to be exec'd against + // immediately. + Args: []string{"pause"}, }}, }, } @@ -365,35 +400,76 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func return stdout.String(), err } - // ncCmd builds a TCP connectivity check using `nc -z` (zero I/O mode), - // which is the canonical probe form busybox 1.36's `nc` actually - // accepts (the bare `nc -w 3 HOST PORT` form is rejected with a usage - // banner). Exit code 0 ⇒ TCP handshake completed, non-zero ⇒ blocked, - // refused, or timed out — exactly what NetworkPolicy controls at L3/L4. + // connectCmd builds a TCP connectivity check using `agnhost connect`, + // the same probe binary the upstream Kubernetes NetworkPolicy + // conformance suite uses. Unlike busybox `nc -z`, agnhost has + // documented, stable exit codes that distinguish *why* a probe + // failed: + // 0 = connected + // 1 = TIMEOUT (silent drop — what NetworkPolicy default-deny does) + // 2 = REFUSED (TCP RST — destination is reachable but nothing is + // listening; means policy let the probe through and + // the test passing on this would be a silent bug) + // 3 = DNS_ERROR + // 4 = OTHER // - // We MUST echo `EXIT=$?` because client-go's StreamWithContext returns - // nil even when the remote command exits non-zero, so the caller cannot - // rely on `streamErr != nil` to mean "blocked" (this is the same - // quirk the canary loop in BeforeAll already handles). 
Callers should - inspect stdout via `connected()` instead of trusting the returned err. - ncCmd := func(targetIP string, targetPort int32) []string { - return []string{"sh", "-c", fmt.Sprintf("nc -z -w 3 %s %d 2>&1; echo EXIT=$?", targetIP, targetPort)} + // We MUST echo `EXIT=$?` because client-go's StreamWithContext + // returns nil even when the remote command exits non-zero, so the + // caller cannot rely on `streamErr != nil` to mean "blocked". + // Callers should inspect stdout via `classifyResult()` instead of + // trusting the returned err. + connectCmd := func(targetIP string, targetPort int32) []string { + return []string{"sh", "-c", fmt.Sprintf( + "/agnhost connect %s:%d --timeout=%s --protocol=tcp 2>&1; echo EXIT=$?", + targetIP, targetPort, probeConnectTimeout, + )} + } + + // classifyResult interprets the stdout/stderr of a connectCmd run via + // probeTarget into one of four states. NOTE: `agnhost connect` exits 1 + // on every failure and prints the reason (TIMEOUT/REFUSED/DNS/OTHER) + // to stderr, so we key off the captured reason string, not the EXIT + // marker. "blocked" is what the deny tests expect — default-deny + // silently drops the SYN and agnhost prints TIMEOUT. "refused" means + // the SYN reached the destination but no listener answered — NOT a + // policy success; deny tests fail loud on it. "indeterminate" covers + // the rare case where the SPDY stream died before the EXIT marker + // was printed (treated as not-connected by `connected()`). + classifyResult := func(out string) string { + switch { + case strings.Contains(out, "EXIT=0"): + return "connected" + case strings.Contains(out, "TIMEOUT"): + return "blocked" + case strings.Contains(out, "REFUSED"): + return "refused" + case strings.Contains(out, "EXIT="): + return "other" + default: + return "indeterminate" + } } - // connected interprets the stdout/stderr of an ncCmd run via probeTarget. - // Returns true iff the embedded EXIT=0 marker is present, meaning the
+ // connected returns true iff the probe completed a TCP handshake. + // Backwards-compatible shim for assertions that only care about the + // connected/not-connected boolean — prefer classifyResult() in new + // code so the deny tests can distinguish "blocked by policy" + // (the wanted outcome) from "refused by an empty endpoint" + // (a bug that policy did NOT block). connected := func(out string) bool { - return strings.Contains(out, "EXIT=0") + return classifyResult(out) == "connected" } - // probe is a convenience wrapper that checks TCP connectivity to the - // namespace-A EPP pod (see the `eppIP` declaration for why EPP, not - // the model pod, is the policy-enforcement sentinel under - // gpu-node-mocker). - probe := func(probeNS string) bool { - out, _ := probeTarget(probeNS, ncCmd(eppIP, eppPort), probeTimeout, nil) - return connected(out) + // probeClassified is a convenience wrapper that checks TCP connectivity + // to the namespace-A EPP pod (see the `eppIP` declaration for why EPP, + // not the model pod, is the policy-enforcement sentinel under + // gpu-node-mocker). Returns the classified result directly so callers + // can distinguish "blocked" (the wanted deny outcome) from "refused" + // (a bug where the destination is reachable but unlistening — policy + // did NOT block the probe). + probeClassified := func(probeNS string) (string, string) { + out, _ := probeTarget(probeNS, connectCmd(eppIP, eppPort), probeTimeout, nil) + return classifyResult(out), out } // ── Enforcement baseline ────────────────────────────────────────────── @@ -408,38 +484,53 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func // pipeline (which only the Gateway pod can reach), not the policy. // We probe the EPP pod (not the model pod) for the same reason the // deny tests do — see `eppIP` declaration. 
- out, _ := probeTarget(namespace, ncCmd(eppIP, eppPort), probeTimeout, nil) + out, _ := probeTarget(namespace, connectCmd(eppIP, eppPort), probeTimeout, nil) Expect(connected(out)).To(BeTrue(), - "intra-namespace TCP reach to EPP pod should succeed — if this fails, NetworkPolicy is over-blocking. nc output: %q", out) + "intra-namespace TCP reach to EPP pod should succeed — if this fails, NetworkPolicy is over-blocking. agnhost output: %q", out) }) It("baseline: should DENY ingress from an external namespace (proves enforcement is active)", func() { - allowed := probe("external-ns") - Expect(allowed).To(BeFalse(), - "external namespace reached model pod — NetworkPolicy enforcement is NOT active; "+ - "remaining deny tests are unreliable. Check that the CNI plugin supports "+ - "NetworkPolicy and that policies were applied correctly.") + state, out := probeClassified("external-ns") + Expect(state).To(Equal("blocked"), + "external namespace should be silently dropped by default-deny-ingress (TIMEOUT/EXIT=1). "+ + "Got state=%q output=%q. If state=='connected', NetworkPolicy enforcement is NOT active and "+ + "remaining deny tests are unreliable. If state=='refused', the EPP Service is misconfigured — "+ + "policy let the probe through but no listener answered.", state, out) }) // ── Deny tests ──────────────────────────────────────────────────────── It("should DENY ingress from a non-gateway pod in default namespace", func() { - Expect(probe("default")).To(BeFalse(), - "traffic from a non-gateway pod in default should be blocked") + state, out := probeClassified("default") + Expect(state).To(Equal("blocked"), + "non-gateway pod in default should be silently dropped. 
state=%q output=%q", state, out) }) It("should DENY ingress from istio-system namespace", func() { - Expect(probe("istio-system")).To(BeFalse(), - "traffic from istio-system should be blocked — only the gateway pod in default is allowed") + state, out := probeClassified("istio-system") + Expect(state).To(Equal("blocked"), + "istio-system should be silently dropped — only the gateway pod in default is allowed. state=%q output=%q", + state, out) }) It("should DENY ingress from a random namespace", func() { - Expect(probe("random-ns")).To(BeFalse(), - "traffic from random-ns should be blocked by default-deny-ingress") + state, out := probeClassified("random-ns") + Expect(state).To(Equal("blocked"), + "random-ns should be silently dropped by default-deny-ingress. state=%q output=%q", state, out) }) - It("should DENY ingress from kube-system namespace", func() { - Expect(probe("kube-system")).To(BeFalse(), - "traffic from kube-system should be blocked by default-deny-ingress") + It("should ALLOW ingress from kube-system namespace (chart allowlist)", func() { + // kube-system is included in modelharness's + // `allowedIngressNamespaces` defaults so control-plane pods that + // actually run in kube-system can reach EPP / vLLM. NOTE(review): + // kubelet probes and konnectivity/exec traffic originate from the + // node or the apiserver tunnel, not kube-system pod IPs — confirm + // this broad allowance is really needed. If this assertion fails, + // verify networkPolicy.allowedIngressNamespaces in values.yaml. + state, out := probeClassified("kube-system") + Expect(state).To(Equal("connected"), + "kube-system should be allowed by allow-inference-traffic — "+ + "see networkPolicy.allowedIngressNamespaces in modelharness values.yaml. 
"+ + "state=%q output=%q", state, out) }) // ── Allow tests ─────────────────────────────────────────────────────── @@ -453,23 +544,28 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func gatewayLabels := map[string]string{ "gateway.networking.k8s.io/gateway-name": "inference-gateway", } - out, _ := probeTarget("default", ncCmd(eppIP, eppPort), probeTimeout, gatewayLabels) - Expect(connected(out)).To(BeFalse(), - "gateway-labeled pod in default should NOT be allowed to TCP-connect to the EPP pod — "+ - "only the in-namespace gateway pod is a trusted ingress source. nc output: %q", out) + out, _ := probeTarget("default", connectCmd(eppIP, eppPort), probeTimeout, gatewayLabels) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "gateway-labeled pod in default should be silently dropped — only the in-namespace "+ + "gateway pod is a trusted ingress source. state=%q output=%q", state, out) }) // ── Cross-namespace isolation ───────────────────────────────────────── It("should DENY ingress from workload namespace A to workload namespace B", func() { - out, _ := probeTarget(namespace, ncCmd(eppIPB, eppPortB), probeTimeout, nil) - Expect(connected(out)).To(BeFalse(), - "workload namespace A should not be able to reach EPP pods in workload namespace B. nc output: %q", out) + out, _ := probeTarget(namespace, connectCmd(eppIPB, eppPortB), probeTimeout, nil) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "workload namespace A should be silently dropped by namespace B's default-deny. "+ + "state=%q output=%q", state, out) }) It("should DENY ingress from workload namespace B to workload namespace A", func() { - out, _ := probeTarget(namespaceB, ncCmd(eppIP, eppPort), probeTimeout, nil) - Expect(connected(out)).To(BeFalse(), - "workload namespace B should not be able to reach EPP pods in workload namespace A. 
nc output: %q", out) + out, _ := probeTarget(namespaceB, connectCmd(eppIP, eppPort), probeTimeout, nil) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "workload namespace B should be silently dropped by namespace A's default-deny. "+ + "state=%q output=%q", state, out) }) // Regression guard: a pod in namespace A that *spoofs* namespace B's @@ -483,10 +579,12 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func spoofedLabels := map[string]string{ "gateway.networking.k8s.io/gateway-name": fmt.Sprintf("%s-gateway", netpolModelB), } - out, _ := probeTarget(namespace, ncCmd(eppIPB, eppPortB), probeTimeout, spoofedLabels) - Expect(connected(out)).To(BeFalse(), - "a pod in namespace A carrying namespace B's gateway label must NOT reach EPP pods in B — "+ - "labels do not grant cross-namespace trust under the post-X1 policy. nc output: %q", out) + out, _ := probeTarget(namespace, connectCmd(eppIPB, eppPortB), probeTimeout, spoofedLabels) + state := classifyResult(out) + Expect(state).To(Equal("blocked"), + "a pod in namespace A carrying namespace B's gateway label must be silently dropped — "+ + "labels do not grant cross-namespace trust under the post-X1 policy. 
state=%q output=%q", + state, out) }) // ── North-South positive path ───────────────────────────────────────── @@ -516,11 +614,11 @@ var _ = Describe("Network Policy", utils.GinkgoLabelNetworkPolicy, Ordered, func Expect(gwPort).To(BeNumerically(">", 0), "gateway Service does not expose port 80") out, _ := probeTarget("e2e-netpol-external-client", - ncCmd(svc.Spec.ClusterIP, gwPort), probeTimeout, nil) + connectCmd(svc.Spec.ClusterIP, gwPort), probeTimeout, nil) Expect(connected(out)).To(BeTrue(), "external-namespace pod should be allowed to TCP-connect to the gateway pod via Service ClusterIP — "+ "if this fails, the workload-namespace NetworkPolicy is over-restrictive and is silently relying on "+ - "apiserver-mediated paths (port-forward / kubelet) for reachability. nc output: %q", out) + "apiserver-mediated paths (port-forward / kubelet) for reachability. agnhost output: %q", out) }) }) diff --git a/test/e2e/utils/helm.go b/test/e2e/utils/helm.go index 4ea1c75..1bd22e5 100644 --- a/test/e2e/utils/helm.go +++ b/test/e2e/utils/helm.go @@ -59,21 +59,6 @@ type ModelDeploymentValues struct { // EnsureNamespace; the warmup loop in SetupInferenceSetsWithRouting // reads the resulting Secret and sends Bearer + Host headers. AuthAPIKeyEnabled bool - // NetworkPolicyEnabled signals that this deployment's namespace - // should be locked down with the default-deny + allow-inference - // NetworkPolicy pair (provisioned by EnsureNamespace). All - // deployments in a case share a namespace, so the value on the first - // deployment is what takes effect. - NetworkPolicyEnabled bool - // NetworkPolicyAllowedNamespaces lists namespaces (matched by the - // standard `kubernetes.io/metadata.name` label) that are granted - // cross-namespace ingress to non-gateway pods when - // NetworkPolicyEnabled is true. Use this to permit control-plane - // scrapers — e.g. `keda-kaito-scaler` in `keda` needs to reach - // vLLM metrics on shadow pods to drive autoscaling decisions. 
- // Leave nil/empty to keep the namespace strictly isolated (the - // default for the network-policy e2e cases). - NetworkPolicyAllowedNamespaces []string } // DefaultModelDeploymentValues returns a populated ModelDeploymentValues for a @@ -192,16 +177,15 @@ func modelHarnessChartPath() string { // in `namespace`. It provisions the per-namespace Gateway (named // "-gw" by chart default), the catch-all model-not-found // HTTPRoute + ReferenceGrant, and — when authEnabled is true — the -// per-namespace AuthorizationPolicy + APIKey CR. When -// networkPolicyEnabled is true, the chart additionally renders the -// default-deny-ingress / allow-inference-traffic NetworkPolicies that +// per-namespace AuthorizationPolicy + APIKey CR. The chart always renders +// the default-deny-ingress / allow-inference-traffic NetworkPolicies that lock down East-West ingress while keeping the gateway pod reachable; -// `allowedIngressNamespaces` (if non-empty) extends -// `allow-inference-traffic` with cross-namespace ingress for the named -// namespaces (matched by the `kubernetes.io/metadata.name` label). +// the chart-default `allowedIngressNamespaces` (currently keda, +// kaito-system, kube-system and monitoring) covers the cross-namespace +// ingress every workload in this repo needs. // // Idempotent: re-running on an existing release reconciles the values.
-func InstallModelHarness(namespace string, authEnabled, networkPolicyEnabled bool, allowedIngressNamespaces []string) error { +func InstallModelHarness(namespace string, authEnabled bool) error { if namespace == "" { return fmt.Errorf("modelharness: namespace is required") } @@ -221,11 +205,6 @@ func InstallModelHarness(namespace string, authEnabled, networkPolicyEnabled boo "--create-namespace", "--set", "namespace=" + namespace, "--set", "auth.enabled=" + strconv.FormatBool(authEnabled), - "--set", "networkPolicy.enabled=" + strconv.FormatBool(networkPolicyEnabled), - } - for i, allowed := range allowedIngressNamespaces { - args = append(args, "--set", - fmt.Sprintf("networkPolicy.allowedIngressNamespaces[%d]=%s", i, allowed)) } args = append(args, "--wait") diff --git a/test/e2e/utils/setup.go b/test/e2e/utils/setup.go index 6b5bfbb..56117de 100644 --- a/test/e2e/utils/setup.go +++ b/test/e2e/utils/setup.go @@ -36,20 +36,19 @@ import ( // default), the catch-all model-not-found HTTPRoute + ReferenceGrant, // — when authEnabled is true — the AuthorizationPolicy + APIKey CR // that wire the Gateway into the cluster-wide apikey-ext-authz CUSTOM -// provider, and — when networkPolicyEnabled is true — the -// default-deny-ingress + allow-inference-traffic NetworkPolicies that -// lock down East-West ingress while keeping the per-namespace gateway -// pod reachable from outside the namespace (matched via the standard -// `gateway.networking.k8s.io/gateway-name` label that Istio stamps on -// every gateway pod). `npAllowedNamespaces` (only honored when -// networkPolicyEnabled is true) grants cross-namespace ingress to -// non-gateway pods for the named namespaces — required for control-plane -// scrapers like `keda-kaito-scaler` that live outside the workload -// namespace. 
+ provider, and the default-deny-ingress + allow-inference-traffic + NetworkPolicies that lock down East-West ingress while keeping the + per-namespace gateway pod reachable from outside the namespace + (matched via the standard `gateway.networking.k8s.io/gateway-name` + label that Istio stamps on every gateway pod). The chart-default + `allowedIngressNamespaces` (currently keda, kaito-system, + kube-system and monitoring) covers the control-plane scrapers — + `keda-kaito-scaler` and the gpu-node-mocker / kaito-workspace + controllers — that need to reach shadow pods from outside the workload namespace. // // Safe to call repeatedly; the underlying `helm upgrade --install` and // namespace Create are both idempotent. -func EnsureNamespace(ctx context.Context, name string, authEnabled, networkPolicyEnabled bool, npAllowedNamespaces []string) error { +func EnsureNamespace(ctx context.Context, name string, authEnabled bool) error { GetClusterClient(TestingCluster) cl := TestingCluster.KubeClient ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: name}} @@ -57,7 +56,7 @@ func EnsureNamespace(ctx context.Context, name string, authEnabled, networkPolic return fmt.Errorf("create namespace %s: %w", name, err) } - if err := InstallModelHarness(name, authEnabled, networkPolicyEnabled, npAllowedNamespaces); err != nil { + if err := InstallModelHarness(name, authEnabled); err != nil { return fmt.Errorf("install modelharness in %s: %w", name, err) }