@@ -4,37 +4,32 @@ name: ModelDeployment benchmark
44# 1. Reuses the E2E base setup composite action to bring up the AKS
55# cluster + all stack components (Istio gateway, BBR, EPP, KEDA,
66# llm-gateway-apikey, gpu-node-mocker, KAITO).
7- # 2. Installs the modeldeployment Helm chart in the `default` namespace,
8- # reusing the cluster-wide `inference-gateway` Gateway and the
9- # cluster-wide `default` APIKey (Secret `llm-api-key`).
10- # 3. Starts a `kubectl port-forward` against the gateway service so the
7+ # 2. Provisions a dedicated workload namespace via the
8+ # `charts/modelharness` Helm chart (per-namespace Istio Gateway
9+ # `<namespace>-gw`, catch-all HTTPRoute + ReferenceGrant pointing
10+ # at the cluster-shared `default/model-not-found` Service, plus
11+ # the per-namespace `AuthorizationPolicy` + `APIKey` CR when
12+ # `auth.enabled=true`).
13+ # 3. Installs the `modeldeployment` Helm chart in the same workload
14+ # namespace, parented to that namespace's Gateway and protected by
15+ # the namespace-local APIKey (Secret `llm-api-key`).
16+ # 4. Starts a `kubectl port-forward` against the gateway service so the
1117# complete request path Gateway → BBR → EPP → vLLM (mock) pod can be
1218# exercised from localhost. The endpoint URL and API key are echoed
1319# to the workflow log.
14- # 4 . Installs guidellm and runs a sweep benchmark against the endpoint,
20+ # 5. Installs guidellm and runs a sweep benchmark against the endpoint,
1521# printing TTFT / TPOT / TPM (and the rest of the guidellm console
1622# summary) into the workflow log.
1723#
1824# All component versions and benchmark parameters are hard-coded in the
1925# `env:` block below — there are no workflow_dispatch inputs. To change a
20- # value, edit this file ( and rely on the pull_request trigger to validate
21- # the change end-to-end ).
26+ # value, edit this file and trigger the workflow manually from the
27+ # Actions tab (the workflow is dispatch-only).
2228
2329on :
24- # Run on PRs that touch the benchmark workflow or anything it exercises
25- # (modeldeployment chart, gpu-node-mocker, e2e composite action) so the
26- # full path Gateway → BBR → EPP → vLLM (mock) pod can be debugged from
27- # a PR.
28- pull_request :
29- branches : [ main ]
30- paths :
31- - ' .github/workflows/benchmark.yaml'
32- - ' .github/actions/e2e-base-setup/**'
33- - ' charts/modeldeployment/**'
34- - ' cmd/gpu-node-mocker/**'
35- - ' pkg/gpu-node-mocker/**'
36- - ' docker/Dockerfile'
37- - ' versions.env'
30+ # Manual-only: this benchmark spins up a real AKS cluster and runs
31+ # guidellm against it, so it must not run automatically on every PR
32+ # or push. Trigger from the Actions tab via "Run workflow".
3833 workflow_dispatch :
3934
4035env :
4540 LOCATION : swedencentral
4641 NODE_COUNT : ' 3'
4742 NODE_VM_SIZE : Standard_D8s_v5
48- GATEWAY_NAME : inference-gateway
49- GATEWAY_NAMESPACE : default
43+ # Dedicated workload namespace provisioned by charts/modelharness.
44+ # Naming: per-namespace Gateway becomes "<namespace>-gw" by chart
45+ # default; the per-namespace APIKey Secret (llm-api-key) lives here. GATEWAY_NAME and the GATEWAY_HOST subdomain below hard-code this value, so change all three together.
46+ WORKLOAD_NAMESPACE : benchmark
47+ GATEWAY_NAME : benchmark-gw
5048 GATEWAY_LOCAL_PORT : ' 18080'
5149 # The llm-gateway-apikey ext_authz filter resolves the APIKey CR's
5250 # namespace from the request's Host header subdomain (
5553 # "cannot determine gateway namespace: set context_extensions[gateway-namespace]
5654 # or use subdomain-based host". We map this FQDN to 127.0.0.1 in
5755 # /etc/hosts so the kubectl port-forward target carries the correct Host.
58- GATEWAY_HOST : default .gw.kaito.sh
56+ GATEWAY_HOST : benchmark .gw.kaito.sh
5957 APIKEY_SECRET_NAME : llm-api-key
6058 APIKEY_SECRET_KEY : apiKey
6159 # ----- Benchmark parameters (previously workflow_dispatch inputs) -----
@@ -153,22 +151,44 @@ jobs:
153151 keda-kaito-scaler-version : ' '
154152 llm-gateway-auth-version : ' '
155153
154+ - name : Provision workload namespace via modelharness
155+ run : |
156+ set -euo pipefail
157+ echo "── Installing modelharness chart in ${WORKLOAD_NAMESPACE} ──"
158+ # The chart provisions:
159+ # - per-namespace Istio Gateway "<namespace>-gw"
160+ # - catch-all HTTPRoute → cluster-shared `default/model-not-found`
161+ # - ReferenceGrant authorising that cross-ns backendRef
162+ # - AuthorizationPolicy wiring the Gateway pod into the
163+ # cluster-wide `apikey-ext-authz` CUSTOM provider
164+ # - APIKey CR `default` (apikey-operator reconciles it into
165+ # Secret `llm-api-key` in this namespace)
166+ # `--create-namespace` lets helm create the workload namespace
167+ # on first install — no separate `kubectl create namespace`
168+ # step needed.
169+ helm upgrade --install modelharness charts/modelharness \
170+ --namespace "${WORKLOAD_NAMESPACE}" \
171+ --create-namespace \
172+ --set namespace="${WORKLOAD_NAMESPACE}" \
173+ --set auth.enabled=true \
174+ --wait --timeout=5m
175+
156176 - name : Deploy inference workload via ModelDeployment
157177 run : |
158178 set -euo pipefail
159179 echo "── Installing modeldeployment chart ──"
160180 echo " release / name : ${DEPLOYMENT_NAME}"
161- echo " namespace : ${GATEWAY_NAMESPACE }"
181+ echo " namespace : ${WORKLOAD_NAMESPACE }"
162182 echo " preset model : ${MODEL_PRESET}"
163183 echo " replicas : ${REPLICAS}"
164184 echo " instance type : ${INSTANCE_TYPE}"
165- echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE })"
185+ echo " gateway : ${GATEWAY_NAME} (namespace ${WORKLOAD_NAMESPACE })"
166186
167187 helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \
168- --namespace "${GATEWAY_NAMESPACE }" \
188+ --namespace "${WORKLOAD_NAMESPACE }" \
169189 --create-namespace \
170190 --set name="${DEPLOYMENT_NAME}" \
171- --set namespace="${GATEWAY_NAMESPACE }" \
191+ --set namespace="${WORKLOAD_NAMESPACE }" \
172192 --set model="${MODEL_PRESET}" \
173193 --set replicas="${REPLICAS}" \
174194 --set instanceType="${INSTANCE_TYPE}" \
@@ -184,15 +204,15 @@ jobs:
184204 echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──"
185205
186206 # Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches).
187- kubectl -n "${GATEWAY_NAMESPACE }" rollout status \
207+ kubectl -n "${WORKLOAD_NAMESPACE }" rollout status \
188208 "deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m
189209
190210 # Wait for the inference (shadow) pods. KAITO labels each
191211 # inference pod with `inferenceset.kaito.sh/created-by=<name>`.
192212 deadline=$(( $(date +%s) + 600 ))
193213 ready=0
194214 while [ "$(date +%s)" -lt "${deadline}" ]; do
195- ready=$(kubectl -n "${GATEWAY_NAMESPACE }" get pods \
215+ ready=$(kubectl -n "${WORKLOAD_NAMESPACE }" get pods \
196216 -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
197217 -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
198218 2>/dev/null | grep -c '^True$' || true)
@@ -205,23 +225,23 @@ jobs:
205225 done
206226 if [ "${ready:-0}" -lt "${REPLICAS}" ]; then
207227 echo "✗ inference pods did not all become Ready within 10m"
208- kubectl -n "${GATEWAY_NAMESPACE }" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
228+ kubectl -n "${WORKLOAD_NAMESPACE }" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
209229 exit 1
210230 fi
211231
212232 # ---------- 2. Wait for the default APIKey Secret ----------
213- echo "── Waiting for Secret ${GATEWAY_NAMESPACE }/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
233+ echo "── Waiting for Secret ${WORKLOAD_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the namespace-local APIKey CR) ──"
214234 secret_found=0
215235 for _ in $(seq 1 60); do
216- if kubectl -n "${GATEWAY_NAMESPACE }" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
236+ if kubectl -n "${WORKLOAD_NAMESPACE }" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
217237 echo "✓ Secret found"
218238 secret_found=1
219239 break
220240 fi
221241 sleep 5
222242 done
223243 if [ "${secret_found}" -ne 1 ]; then
224- echo "✗ Secret ${GATEWAY_NAMESPACE }/${APIKEY_SECRET_NAME} did not appear within 5m"
244+ echo "✗ Secret ${WORKLOAD_NAMESPACE }/${APIKEY_SECRET_NAME} did not appear within 5m"
225245 exit 1
226246 fi
227247
@@ -246,7 +266,7 @@ jobs:
246266 # Start kubectl port-forward in the background. Output goes to a
247267 # log file so the workflow can dump it on failure.
248268 mkdir -p "${PF_DIR}"
249- nohup kubectl -n "${GATEWAY_NAMESPACE }" port-forward \
269+ nohup kubectl -n "${WORKLOAD_NAMESPACE }" port-forward \
250270 "svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \
251271 > "${PF_DIR}/port-forward.log" 2>&1 &
252272 PF_PID=$!
@@ -267,7 +287,7 @@ jobs:
267287 exit 1
268288 fi
269289
270- API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE }" get secret \
290+ API_KEY="$(kubectl -n "${WORKLOAD_NAMESPACE }" get secret \
271291 "${APIKEY_SECRET_NAME}" \
272292 -o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)"
273293 # Use the subdomain-based FQDN (mapped to 127.0.0.1 above) so the
@@ -387,7 +407,7 @@ jobs:
387407 echo "| --- | --- |"
388408 echo "| model preset | \`${MODEL_PRESET}\` |"
389409 echo "| deployment | \`${DEPLOYMENT_NAME}\` |"
390- echo "| namespace | \`${GATEWAY_NAMESPACE }\` |"
410+ echo "| namespace | \`${WORKLOAD_NAMESPACE }\` |"
391411 echo "| replicas | ${REPLICAS} |"
392412 echo "| instance type | \`${INSTANCE_TYPE}\` |"
393413 echo "| guidellm profile | \`${BENCHMARK_PROFILE}\` |"
@@ -449,14 +469,14 @@ jobs:
449469 # on a real node and labelled `kaito.sh/managed-by=gpu-mocker` /
450470 # `kaito.sh/shadow-pod-for=<ns>.<original-name>`. Resolve each
451471 # original pod to its shadow before scraping /metrics.
452- ORIGINAL_PODS="$(kubectl -n "${GATEWAY_NAMESPACE }" get pods \
472+ ORIGINAL_PODS="$(kubectl -n "${WORKLOAD_NAMESPACE }" get pods \
453473 -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
454474 -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
455475
456476 PODS=""
457477 for ORIG in ${ORIGINAL_PODS}; do
458- SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${GATEWAY_NAMESPACE }.${ORIG}"
459- SHADOW_POD="$(kubectl -n "${GATEWAY_NAMESPACE }" get pods \
478+ SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${WORKLOAD_NAMESPACE }.${ORIG}"
479+ SHADOW_POD="$(kubectl -n "${WORKLOAD_NAMESPACE }" get pods \
460480 -l "${SHADOW_SELECTOR}" \
461481 -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
462482 if [ -z "${SHADOW_POD}" ]; then
@@ -500,8 +520,8 @@ jobs:
500520 for POD in ${PODS}; do
501521 RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt"
502522 echo "── ${POD} ──"
503- if ! kubectl -n "${GATEWAY_NAMESPACE }" get \
504- --raw "/api/v1/namespaces/${GATEWAY_NAMESPACE }/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
523+ if ! kubectl -n "${WORKLOAD_NAMESPACE }" get \
524+ --raw "/api/v1/namespaces/${WORKLOAD_NAMESPACE }/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
505525 > "${RAW}" 2>"${SCRAPE_ERR}"; then
506526 echo " ✗ failed to scrape /metrics:"
507527 sed 's/^/ /' "${SCRAPE_ERR}" || true
@@ -590,14 +610,23 @@ jobs:
590610
591611 # ---------- 2. Uninstall modeldeployment Helm release ----------
592612 helm uninstall "${DEPLOYMENT_NAME}" \
593- --namespace "${GATEWAY_NAMESPACE}" \
613+ --namespace "${WORKLOAD_NAMESPACE}" \
614+ --ignore-not-found --wait || true
615+
616+ # ---------- 3. Uninstall modelharness Helm release ----------
617+ helm uninstall modelharness \
618+ --namespace "${WORKLOAD_NAMESPACE}" \
594619 --ignore-not-found --wait || true
595620
596- # ---------- 3. Dump cluster state on failure ----------
621+ # ---------- 4. Delete workload namespace ----------
622+ kubectl delete namespace "${WORKLOAD_NAMESPACE}" \
623+ --ignore-not-found --wait=false || true
624+
625+ # ---------- 5. Dump cluster state on failure ----------
597626 if [ "${JOB_STATUS}" = "failure" ]; then
598627 echo "── Job failed — dumping cluster state ──"
599628 make e2e-dump || true
600629 fi
601630
602- # ---------- 4 . Teardown cluster ----------
631+ # ---------- 6. Teardown cluster ----------
603632 make e2e-teardown || true
0 commit comments