Skip to content

Commit 36ca2a4

Browse files
authored
feat: add modelharness helm chart (#46)
Signed-off-by: rambohe-ch <rambohe.ch@gmail.com>
1 parent 7ef9e3b commit 36ca2a4

30 files changed

Lines changed: 633 additions & 794 deletions

.github/workflows/benchmark.yaml

Lines changed: 74 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,32 @@ name: ModelDeployment benchmark
44
# 1. Reuses the E2E base setup composite action to bring up the AKS
55
# cluster + all stack components (Istio gateway, BBR, EPP, KEDA,
66
# llm-gateway-apikey, gpu-node-mocker, KAITO).
7-
# 2. Installs the modeldeployment Helm chart in the `default` namespace,
8-
# reusing the cluster-wide `inference-gateway` Gateway and the
9-
# cluster-wide `default` APIKey (Secret `llm-api-key`).
10-
# 3. Starts a `kubectl port-forward` against the gateway service so the
7+
# 2. Provisions a dedicated workload namespace via the
8+
# `charts/modelharness` Helm chart (per-namespace Istio Gateway
9+
# `<namespace>-gw`, catch-all HTTPRoute + ReferenceGrant pointing
10+
# at the cluster-shared `default/model-not-found` Service, plus
11+
# the per-namespace `AuthorizationPolicy` + `APIKey` CR when
12+
# `auth.enabled=true`).
13+
# 3. Installs the `modeldeployment` Helm chart in the same workload
14+
# namespace, parented to that namespace's Gateway and protected by
15+
# the namespace-local APIKey (Secret `llm-api-key`).
16+
# 4. Starts a `kubectl port-forward` against the gateway service so the
1117
# complete request path Gateway → BBR → EPP → vLLM (mock) pod can be
1218
# exercised from localhost. The endpoint URL and API key are echoed
1319
# to the workflow log.
14-
# 4. Installs guidellm and runs a sweep benchmark against the endpoint,
20+
# 5. Installs guidellm and runs a sweep benchmark against the endpoint,
1521
# printing TTFT / TPOT / TPM (and the rest of the guidellm console
1622
# summary) into the workflow log.
1723
#
1824
# All component versions and benchmark parameters are hard-coded in the
1925
# `env:` block below — there are no workflow_dispatch inputs. To change a
20-
# value, edit this file (and rely on the pull_request trigger to validate
21-
# the change end-to-end).
26+
# value, edit this file and trigger the workflow manually from the
27+
# Actions tab (the workflow is dispatch-only).
2228

2329
on:
24-
# Run on PRs that touch the benchmark workflow or anything it exercises
25-
# (modeldeployment chart, gpu-node-mocker, e2e composite action) so the
26-
# full path Gateway → BBR → EPP → vLLM (mock) pod can be debugged from
27-
# a PR.
28-
pull_request:
29-
branches: [ main ]
30-
paths:
31-
- '.github/workflows/benchmark.yaml'
32-
- '.github/actions/e2e-base-setup/**'
33-
- 'charts/modeldeployment/**'
34-
- 'cmd/gpu-node-mocker/**'
35-
- 'pkg/gpu-node-mocker/**'
36-
- 'docker/Dockerfile'
37-
- 'versions.env'
30+
# Manual-only: this benchmark spins up a real AKS cluster and runs
31+
# guidellm against it, so it must not run automatically on every PR
32+
# or push. Trigger from the Actions tab via "Run workflow".
3833
workflow_dispatch:
3934

4035
env:
@@ -45,8 +40,11 @@ env:
4540
LOCATION: swedencentral
4641
NODE_COUNT: '3'
4742
NODE_VM_SIZE: Standard_D8s_v5
48-
GATEWAY_NAME: inference-gateway
49-
GATEWAY_NAMESPACE: default
43+
# Dedicated workload namespace provisioned by charts/modelharness.
44+
# Naming: per-namespace Gateway becomes "<namespace>-gw" by chart
45+
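# default; the per-namespace APIKey Secret (llm-api-key) lives here. GATEWAY_NAME ("<namespace>-gw") and the subdomain of GATEWAY_HOST below must match this namespace — change all three together.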
46+
WORKLOAD_NAMESPACE: benchmark
47+
GATEWAY_NAME: benchmark-gw
5048
GATEWAY_LOCAL_PORT: '18080'
5149
# The llm-gateway-apikey ext_authz filter resolves the APIKey CR's
5250
# namespace from the request's Host header subdomain (
@@ -55,7 +53,7 @@ env:
5553
# "cannot determine gateway namespace: set context_extensions[gateway-namespace]
5654
# or use subdomain-based host". We map this FQDN to 127.0.0.1 in
5755
# /etc/hosts so the kubectl port-forward target carries the correct Host.
58-
GATEWAY_HOST: default.gw.kaito.sh
56+
GATEWAY_HOST: benchmark.gw.kaito.sh
5957
APIKEY_SECRET_NAME: llm-api-key
6058
APIKEY_SECRET_KEY: apiKey
6159
# ----- Benchmark parameters (previously workflow_dispatch inputs) -----
@@ -153,22 +151,44 @@ jobs:
153151
keda-kaito-scaler-version: ''
154152
llm-gateway-auth-version: ''
155153

154+
- name: Provision workload namespace via modelharness
155+
run: |
156+
set -euo pipefail
157+
echo "── Installing modelharness chart in ${WORKLOAD_NAMESPACE} ──"
158+
# The chart provisions:
159+
# - per-namespace Istio Gateway "<namespace>-gw"
160+
# - catch-all HTTPRoute → cluster-shared `default/model-not-found`
161+
# - ReferenceGrant authorising that cross-ns backendRef
162+
# - AuthorizationPolicy wiring the Gateway pod into the
163+
# cluster-wide `apikey-ext-authz` CUSTOM provider
164+
# - APIKey CR `default` (apikey-operator reconciles it into
165+
# Secret `llm-api-key` in this namespace)
166+
# `--create-namespace` lets helm create the workload namespace
167+
# on first install — no separate `kubectl create namespace`
168+
# step needed.
169+
helm upgrade --install modelharness charts/modelharness \
170+
--namespace "${WORKLOAD_NAMESPACE}" \
171+
--create-namespace \
172+
--set namespace="${WORKLOAD_NAMESPACE}" \
173+
--set auth.enabled=true \
174+
--wait --timeout=5m
175+
156176
- name: Deploy inference workload via ModelDeployment
157177
run: |
158178
set -euo pipefail
159179
echo "── Installing modeldeployment chart ──"
160180
echo " release / name : ${DEPLOYMENT_NAME}"
161-
echo " namespace : ${GATEWAY_NAMESPACE}"
181+
echo " namespace : ${WORKLOAD_NAMESPACE}"
162182
echo " preset model : ${MODEL_PRESET}"
163183
echo " replicas : ${REPLICAS}"
164184
echo " instance type : ${INSTANCE_TYPE}"
165-
echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE})"
185+
echo " gateway : ${GATEWAY_NAME} (namespace ${WORKLOAD_NAMESPACE})"
166186
167187
helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \
168-
--namespace "${GATEWAY_NAMESPACE}" \
188+
--namespace "${WORKLOAD_NAMESPACE}" \
169189
--create-namespace \
170190
--set name="${DEPLOYMENT_NAME}" \
171-
--set namespace="${GATEWAY_NAMESPACE}" \
191+
--set namespace="${WORKLOAD_NAMESPACE}" \
172192
--set model="${MODEL_PRESET}" \
173193
--set replicas="${REPLICAS}" \
174194
--set instanceType="${INSTANCE_TYPE}" \
@@ -184,15 +204,15 @@ jobs:
184204
echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──"
185205
186206
# Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches).
187-
kubectl -n "${GATEWAY_NAMESPACE}" rollout status \
207+
kubectl -n "${WORKLOAD_NAMESPACE}" rollout status \
188208
"deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m
189209
190210
# Wait for the inference (shadow) pods. KAITO labels each
191211
# inference pod with `inferenceset.kaito.sh/created-by=<name>`.
192212
deadline=$(( $(date +%s) + 600 ))
193213
ready=0
194214
while [ "$(date +%s)" -lt "${deadline}" ]; do
195-
ready=$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
215+
ready=$(kubectl -n "${WORKLOAD_NAMESPACE}" get pods \
196216
-l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
197217
-o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
198218
2>/dev/null | grep -c '^True$' || true)
@@ -205,23 +225,23 @@ jobs:
205225
done
206226
if [ "${ready:-0}" -lt "${REPLICAS}" ]; then
207227
echo "✗ inference pods did not all become Ready within 10m"
208-
kubectl -n "${GATEWAY_NAMESPACE}" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
228+
kubectl -n "${WORKLOAD_NAMESPACE}" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
209229
exit 1
210230
fi
211231
212232
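# ---------- 2. Wait for the namespace-local APIKey Secret (reconciled from the APIKey CR `default` installed by modelharness; NOTE(review): the "cluster-wide APIKey CR" wording in the log message below is stale) ----------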
213-
echo "── Waiting for Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
233+
echo "── Waiting for Secret ${WORKLOAD_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
214234
secret_found=0
215235
for _ in $(seq 1 60); do
216-
if kubectl -n "${GATEWAY_NAMESPACE}" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
236+
if kubectl -n "${WORKLOAD_NAMESPACE}" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
217237
echo "✓ Secret found"
218238
secret_found=1
219239
break
220240
fi
221241
sleep 5
222242
done
223243
if [ "${secret_found}" -ne 1 ]; then
224-
echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} did not appear within 5m"
244+
echo "✗ Secret ${WORKLOAD_NAMESPACE}/${APIKEY_SECRET_NAME} did not appear within 5m"
225245
exit 1
226246
fi
227247
@@ -246,7 +266,7 @@ jobs:
246266
# Start kubectl port-forward in the background. Output goes to a
247267
# log file so the workflow can dump it on failure.
248268
mkdir -p "${PF_DIR}"
249-
nohup kubectl -n "${GATEWAY_NAMESPACE}" port-forward \
269+
nohup kubectl -n "${WORKLOAD_NAMESPACE}" port-forward \
250270
"svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \
251271
> "${PF_DIR}/port-forward.log" 2>&1 &
252272
PF_PID=$!
@@ -267,7 +287,7 @@ jobs:
267287
exit 1
268288
fi
269289
270-
API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE}" get secret \
290+
API_KEY="$(kubectl -n "${WORKLOAD_NAMESPACE}" get secret \
271291
"${APIKEY_SECRET_NAME}" \
272292
-o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)"
273293
# Use the subdomain-based FQDN (mapped to 127.0.0.1 above) so the
@@ -387,7 +407,7 @@ jobs:
387407
echo "| --- | --- |"
388408
echo "| model preset | \`${MODEL_PRESET}\` |"
389409
echo "| deployment | \`${DEPLOYMENT_NAME}\` |"
390-
echo "| namespace | \`${GATEWAY_NAMESPACE}\` |"
410+
echo "| namespace | \`${WORKLOAD_NAMESPACE}\` |"
391411
echo "| replicas | ${REPLICAS} |"
392412
echo "| instance type | \`${INSTANCE_TYPE}\` |"
393413
echo "| guidellm profile | \`${BENCHMARK_PROFILE}\` |"
@@ -449,14 +469,14 @@ jobs:
449469
# on a real node and labelled `kaito.sh/managed-by=gpu-mocker` /
450470
# `kaito.sh/shadow-pod-for=<ns>.<original-name>`. Resolve each
451471
# original pod to its shadow before scraping /metrics.
452-
ORIGINAL_PODS="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
472+
ORIGINAL_PODS="$(kubectl -n "${WORKLOAD_NAMESPACE}" get pods \
453473
-l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
454474
-o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
455475
456476
PODS=""
457477
for ORIG in ${ORIGINAL_PODS}; do
458-
SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${GATEWAY_NAMESPACE}.${ORIG}"
459-
SHADOW_POD="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
478+
SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${WORKLOAD_NAMESPACE}.${ORIG}"
479+
SHADOW_POD="$(kubectl -n "${WORKLOAD_NAMESPACE}" get pods \
460480
-l "${SHADOW_SELECTOR}" \
461481
-o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
462482
if [ -z "${SHADOW_POD}" ]; then
@@ -500,8 +520,8 @@ jobs:
500520
for POD in ${PODS}; do
501521
RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt"
502522
echo "── ${POD} ──"
503-
if ! kubectl -n "${GATEWAY_NAMESPACE}" get \
504-
--raw "/api/v1/namespaces/${GATEWAY_NAMESPACE}/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
523+
if ! kubectl -n "${WORKLOAD_NAMESPACE}" get \
524+
--raw "/api/v1/namespaces/${WORKLOAD_NAMESPACE}/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
505525
> "${RAW}" 2>"${SCRAPE_ERR}"; then
506526
echo " ✗ failed to scrape /metrics:"
507527
sed 's/^/ /' "${SCRAPE_ERR}" || true
@@ -590,14 +610,23 @@ jobs:
590610
591611
# ---------- 2. Uninstall modeldeployment Helm release ----------
592612
helm uninstall "${DEPLOYMENT_NAME}" \
593-
--namespace "${GATEWAY_NAMESPACE}" \
613+
--namespace "${WORKLOAD_NAMESPACE}" \
614+
--ignore-not-found --wait || true
615+
616+
# ---------- 3. Uninstall modelharness Helm release ----------
617+
helm uninstall modelharness \
618+
--namespace "${WORKLOAD_NAMESPACE}" \
594619
--ignore-not-found --wait || true
595620
596-
# ---------- 3. Dump cluster state on failure ----------
621+
# ---------- 4. Delete workload namespace ----------
622+
kubectl delete namespace "${WORKLOAD_NAMESPACE}" \
623+
--ignore-not-found --wait=false || true
624+
625+
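# ---------- 5. Dump cluster state on failure (NOTE(review): this runs after the workload releases were uninstalled and the namespace deletion was requested above, so workload-namespace resources may be missing from the dump — confirm this ordering is intended) ----------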
597626
if [ "${JOB_STATUS}" = "failure" ]; then
598627
echo "── Job failed — dumping cluster state ──"
599628
make e2e-dump || true
600629
fi
601630
602-
# ---------- 4. Teardown cluster ----------
631+
# ---------- 6. Teardown cluster ----------
603632
make e2e-teardown || true

0 commit comments

Comments
 (0)