@@ -4,14 +4,20 @@ name: ModelDeployment benchmark
44# 1. Reuses the E2E base setup composite action to bring up the AKS
55# cluster + all stack components (Istio gateway, BBR, EPP, KEDA,
66# llm-gateway-apikey, gpu-node-mocker, KAITO).
7- # 2. Installs the modeldeployment Helm chart in the `default` namespace,
8- # reusing the cluster-wide `inference-gateway` Gateway and the
9- # cluster-wide `default` APIKey (Secret `llm-api-key`).
10- # 3. Starts a `kubectl port-forward` against the gateway service so the
7+ # 2. Provisions a dedicated workload namespace via the
8+ # `charts/modelharness` Helm chart (per-namespace Istio Gateway
9+ # `<namespace>-gw`, catch-all HTTPRoute + ReferenceGrant pointing
10+ # at the cluster-shared `default/model-not-found` Service, plus
11+ # the per-namespace `AuthorizationPolicy` + `APIKey` CR when
12+ # `auth.enabled=true`).
13+ # 3. Installs the `modeldeployment` Helm chart in the same workload
14+ # namespace, parented to that namespace's Gateway and protected by
15+ # the namespace-local APIKey (Secret `llm-api-key`).
16+ # 4. Starts a `kubectl port-forward` against the gateway service so the
1117# complete request path Gateway → BBR → EPP → vLLM (mock) pod can be
1218# exercised from localhost. The endpoint URL and API key are echoed
1319# to the workflow log.
14- # 4 . Installs guidellm and runs a sweep benchmark against the endpoint,
20+ # 5. Installs guidellm and runs a sweep benchmark against the endpoint,
1521# printing TTFT / TPOT / TPM (and the rest of the guidellm console
1622# summary) into the workflow log.
1723#
4551 LOCATION : swedencentral
4652 NODE_COUNT : ' 3'
4753 NODE_VM_SIZE : Standard_D8s_v5
48- GATEWAY_NAME : inference-gateway
49- GATEWAY_NAMESPACE : default
54+ # Dedicated workload namespace provisioned by charts/modelharness; the
55+ # per-namespace APIKey Secret (llm-api-key) lives here. Keep GATEWAY_NAME
56+ # ("<namespace>-gw", the chart default) and GATEWAY_HOST in sync with it.
57+ WORKLOAD_NAMESPACE : benchmark
58+ GATEWAY_NAME : benchmark-gw
5059 GATEWAY_LOCAL_PORT : ' 18080'
5160 # The llm-gateway-apikey ext_authz filter resolves the APIKey CR's
5261 # namespace from the request's Host header subdomain (
5564 # "cannot determine gateway namespace: set context_extensions[gateway-namespace]
5665 # or use subdomain-based host". We map this FQDN to 127.0.0.1 in
5766 # /etc/hosts so the kubectl port-forward target carries the correct Host.
58- GATEWAY_HOST : default .gw.kaito.sh
67+ GATEWAY_HOST : benchmark .gw.kaito.sh
5968 APIKEY_SECRET_NAME : llm-api-key
6069 APIKEY_SECRET_KEY : apiKey
6170 # ----- Benchmark parameters (previously workflow_dispatch inputs) -----
@@ -153,22 +162,44 @@ jobs:
153162 keda-kaito-scaler-version : ' '
154163 llm-gateway-auth-version : ' '
155164
165+ - name : Provision workload namespace via modelharness
166+ run : |
167+ set -euo pipefail
168+ echo "── Installing modelharness chart in ${WORKLOAD_NAMESPACE} ──"
169+ # The chart provisions:
170+ # - per-namespace Istio Gateway "<namespace>-gw"
171+ # - catch-all HTTPRoute → cluster-shared `default/model-not-found`
172+ # - ReferenceGrant authorising that cross-ns backendRef
173+ # and, because `auth.enabled=true` is passed below:
174+ # - AuthorizationPolicy wiring the Gateway pod into the cluster-wide `apikey-ext-authz` CUSTOM provider
175+ # - APIKey CR `default` (apikey-operator reconciles it into
176+ # Secret `llm-api-key` in this namespace)
177+ # `--create-namespace` lets helm create the workload namespace
178+ # on first install — no separate `kubectl create namespace`
179+ # step needed.
180+ helm upgrade --install modelharness charts/modelharness \
181+ --namespace "${WORKLOAD_NAMESPACE}" \
182+ --create-namespace \
183+ --set namespace="${WORKLOAD_NAMESPACE}" \
184+ --set auth.enabled=true \
185+ --wait --timeout=5m
186+
156187 - name : Deploy inference workload via ModelDeployment
157188 run : |
158189 set -euo pipefail
159190 echo "── Installing modeldeployment chart ──"
160191 echo " release / name : ${DEPLOYMENT_NAME}"
161- echo " namespace : ${GATEWAY_NAMESPACE }"
192+ echo " namespace : ${WORKLOAD_NAMESPACE }"
162193 echo " preset model : ${MODEL_PRESET}"
163194 echo " replicas : ${REPLICAS}"
164195 echo " instance type : ${INSTANCE_TYPE}"
165- echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE })"
196+ echo " gateway : ${GATEWAY_NAME} (namespace ${WORKLOAD_NAMESPACE })"
166197
167198 helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \
168- --namespace "${GATEWAY_NAMESPACE }" \
199+ --namespace "${WORKLOAD_NAMESPACE }" \
169200 --create-namespace \
170201 --set name="${DEPLOYMENT_NAME}" \
171- --set namespace="${GATEWAY_NAMESPACE }" \
202+ --set namespace="${WORKLOAD_NAMESPACE }" \
172203 --set model="${MODEL_PRESET}" \
173204 --set replicas="${REPLICAS}" \
174205 --set instanceType="${INSTANCE_TYPE}" \
@@ -184,15 +215,15 @@ jobs:
184215 echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──"
185216
186217 # Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches).
187- kubectl -n "${GATEWAY_NAMESPACE }" rollout status \
218+ kubectl -n "${WORKLOAD_NAMESPACE }" rollout status \
188219 "deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m
189220
190221 # Wait for the inference (shadow) pods. KAITO labels each
191222 # inference pod with `inferenceset.kaito.sh/created-by=<name>`.
192223 deadline=$(( $(date +%s) + 600 ))
193224 ready=0
194225 while [ "$(date +%s)" -lt "${deadline}" ]; do
195- ready=$(kubectl -n "${GATEWAY_NAMESPACE }" get pods \
226+ ready=$(kubectl -n "${WORKLOAD_NAMESPACE }" get pods \
196227 -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
197228 -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
198229 2>/dev/null | grep -c '^True$' || true)
@@ -205,23 +236,23 @@ jobs:
205236 done
206237 if [ "${ready:-0}" -lt "${REPLICAS}" ]; then
207238 echo "✗ inference pods did not all become Ready within 10m"
208- kubectl -n "${GATEWAY_NAMESPACE }" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
239+ kubectl -n "${WORKLOAD_NAMESPACE }" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
209240 exit 1
210241 fi
211242
212243 # ---------- 2. Wait for the default APIKey Secret ----------
213- echo "── Waiting for Secret ${GATEWAY_NAMESPACE }/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
244+ echo "── Waiting for Secret ${WORKLOAD_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the namespace-local APIKey CR) ──"
214245 secret_found=0
215246 for _ in $(seq 1 60); do
216- if kubectl -n "${GATEWAY_NAMESPACE }" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
247+ if kubectl -n "${WORKLOAD_NAMESPACE }" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
217248 echo "✓ Secret found"
218249 secret_found=1
219250 break
220251 fi
221252 sleep 5
222253 done
223254 if [ "${secret_found}" -ne 1 ]; then
224- echo "✗ Secret ${GATEWAY_NAMESPACE }/${APIKEY_SECRET_NAME} did not appear within 5m"
255+ echo "✗ Secret ${WORKLOAD_NAMESPACE }/${APIKEY_SECRET_NAME} did not appear within 5m"
225256 exit 1
226257 fi
227258
@@ -246,7 +277,7 @@ jobs:
246277 # Start kubectl port-forward in the background. Output goes to a
247278 # log file so the workflow can dump it on failure.
248279 mkdir -p "${PF_DIR}"
249- nohup kubectl -n "${GATEWAY_NAMESPACE }" port-forward \
280+ nohup kubectl -n "${WORKLOAD_NAMESPACE }" port-forward \
250281 "svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \
251282 > "${PF_DIR}/port-forward.log" 2>&1 &
252283 PF_PID=$!
@@ -267,7 +298,7 @@ jobs:
267298 exit 1
268299 fi
269300
270- API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE }" get secret \
301+ API_KEY="$(kubectl -n "${WORKLOAD_NAMESPACE }" get secret \
271302 "${APIKEY_SECRET_NAME}" \
272303 -o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)"
273304 # Use the subdomain-based FQDN (mapped to 127.0.0.1 above) so the
@@ -387,7 +418,7 @@ jobs:
387418 echo "| --- | --- |"
388419 echo "| model preset | \`${MODEL_PRESET}\` |"
389420 echo "| deployment | \`${DEPLOYMENT_NAME}\` |"
390- echo "| namespace | \`${GATEWAY_NAMESPACE }\` |"
421+ echo "| namespace | \`${WORKLOAD_NAMESPACE }\` |"
391422 echo "| replicas | ${REPLICAS} |"
392423 echo "| instance type | \`${INSTANCE_TYPE}\` |"
393424 echo "| guidellm profile | \`${BENCHMARK_PROFILE}\` |"
@@ -449,14 +480,14 @@ jobs:
449480 # on a real node and labelled `kaito.sh/managed-by=gpu-mocker` /
450481 # `kaito.sh/shadow-pod-for=<ns>.<original-name>`. Resolve each
451482 # original pod to its shadow before scraping /metrics.
452- ORIGINAL_PODS="$(kubectl -n "${GATEWAY_NAMESPACE }" get pods \
483+ ORIGINAL_PODS="$(kubectl -n "${WORKLOAD_NAMESPACE }" get pods \
453484 -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
454485 -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
455486
456487 PODS=""
457488 for ORIG in ${ORIGINAL_PODS}; do
458- SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${GATEWAY_NAMESPACE }.${ORIG}"
459- SHADOW_POD="$(kubectl -n "${GATEWAY_NAMESPACE }" get pods \
489+ SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${WORKLOAD_NAMESPACE }.${ORIG}"
490+ SHADOW_POD="$(kubectl -n "${WORKLOAD_NAMESPACE }" get pods \
460491 -l "${SHADOW_SELECTOR}" \
461492 -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
462493 if [ -z "${SHADOW_POD}" ]; then
@@ -500,8 +531,8 @@ jobs:
500531 for POD in ${PODS}; do
501532 RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt"
502533 echo "── ${POD} ──"
503- if ! kubectl -n "${GATEWAY_NAMESPACE }" get \
504- --raw "/api/v1/namespaces/${GATEWAY_NAMESPACE }/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
534+ if ! kubectl -n "${WORKLOAD_NAMESPACE }" get \
535+ --raw "/api/v1/namespaces/${WORKLOAD_NAMESPACE }/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
505536 > "${RAW}" 2>"${SCRAPE_ERR}"; then
506537 echo " ✗ failed to scrape /metrics:"
507538 sed 's/^/ /' "${SCRAPE_ERR}" || true
@@ -590,14 +621,23 @@ jobs:
590621
591622 # ---------- 2. Uninstall modeldeployment Helm release ----------
592623 helm uninstall "${DEPLOYMENT_NAME}" \
593- --namespace "${GATEWAY_NAMESPACE}" \
624+ --namespace "${WORKLOAD_NAMESPACE}" \
625+ --ignore-not-found --wait || true
626+
627+ # ---------- 3. Uninstall modelharness Helm release ----------
628+ helm uninstall modelharness \
629+ --namespace "${WORKLOAD_NAMESPACE}" \
594630 --ignore-not-found --wait || true
595631
596- # ---------- 3. Dump cluster state on failure ----------
632+ # ---------- 4. Delete workload namespace ----------
633+ kubectl delete namespace "${WORKLOAD_NAMESPACE}" \
634+ --ignore-not-found --wait=false || true
635+
636+ # ---------- 5. Dump cluster state on failure ----------
597637 if [ "${JOB_STATUS}" = "failure" ]; then
598638 echo "── Job failed — dumping cluster state ──"
599639 make e2e-dump || true
600640 fi
601641
602- # ---------- 4 . Teardown cluster ----------
642+ # ---------- 6. Teardown cluster ----------
603643 make e2e-teardown || true
0 commit comments