Skip to content

feat: add benchmark workflow #2

feat: add benchmark workflow

feat: add benchmark workflow #2

Workflow file for this run

name: ModelDeployment benchmark
# Workflow that:
# 1. Reuses the E2E base setup composite action to bring up the AKS
# cluster + all stack components (Istio gateway, BBR, EPP, KEDA,
# llm-gateway-apikey, gpu-node-mocker, KAITO).
# 2. Installs the modeldeployment Helm chart in the `default` namespace,
# reusing the cluster-wide `inference-gateway` Gateway and the
# cluster-wide `default` APIKey (Secret `llm-api-key`).
# 3. Starts a `kubectl port-forward` against the gateway service so the
# complete request path Gateway → BBR → EPP → vLLM (mock) pod can be
# exercised from localhost. The endpoint URL and API key are echoed
# to the workflow log.
# 4. Installs guidellm and runs a sweep benchmark against the endpoint,
# printing TTFT / TPOT / TPM (and the rest of the guidellm console
# summary) into the workflow log.
#
# All component versions and benchmark parameters are hard-coded in the
# `env:` block below — there are no workflow_dispatch inputs. To change a
# value, edit this file (and rely on the pull_request trigger to validate
# the change end-to-end).
on:
# Run on PRs that touch the benchmark workflow or anything it exercises
# (modeldeployment chart, gpu-node-mocker, e2e composite action) so the
# full path Gateway → BBR → EPP → vLLM (mock) pod can be debugged from
# a PR.
pull_request:
branches: [ main ]
paths:
- '.github/workflows/benchmark.yaml'
- '.github/actions/e2e-base-setup/**'
- 'charts/modeldeployment/**'
- 'cmd/gpu-node-mocker/**'
- 'pkg/gpu-node-mocker/**'
- 'docker/Dockerfile'
- 'versions.env'
workflow_dispatch:
env:
RESOURCE_GROUP: "kaito-gw-bench-rg-${{ github.run_id }}"
CLUSTER_NAME: "kaito-gw-bench-aks-${{ github.run_id }}"
ACR_NAME: "kaitogwbenchaks${{ github.run_id }}acr"
GPU_MOCKER_IMAGE: "gpu-node-mocker:latest-${{ github.run_id }}"
LOCATION: swedencentral
NODE_COUNT: '3'
NODE_VM_SIZE: Standard_D8s_v5
GATEWAY_NAME: inference-gateway
GATEWAY_NAMESPACE: default
GATEWAY_LOCAL_PORT: '18080'
APIKEY_SECRET_NAME: llm-api-key
APIKEY_SECRET_KEY: apiKey
# Host header sent on every request through the local port-forward.
# The cluster-wide llm-gateway-apikey AuthorizationPolicy resolves the
# gateway namespace from the request's Host header subdomain
# (`<namespace>.gw.example.com`). Without this header the authz service
# rejects the request with HTTP 401
# `cannot determine gateway namespace: set context_extensions[gateway-namespace] or use subdomain-based host`.
# The e2e suite uses the same subdomain convention (see
# SendChatCompletionWithAuth in test/e2e/utils/http.go).
GATEWAY_HOSTNAME: 'default.gw.example.com'
# ----- Benchmark parameters (previously workflow_dispatch inputs) -----
# KAITO inference preset to benchmark (must exist in KAITO main HEAD
# model_catalog.yaml).
MODEL_PRESET: 'phi-4-mini-instruct'
# Helm release / InferenceSet name. Also the value of the `model` field
# in OpenAI requests (matched by the HTTPRoute as X-Gateway-Model-Name).
DEPLOYMENT_NAME: 'benchmark-phi'
# VM instance type passed to the InferenceSet (must be a SKU the
# gpu-node-mocker labels as a GPU node).
INSTANCE_TYPE: 'Standard_NV36ads_A10_v5'
# guidellm benchmark profile (sweep / throughput / concurrent /
# constant / poisson / synchronous).
BENCHMARK_PROFILE: 'sweep'
# Maximum seconds per guidellm sub-benchmark.
BENCHMARK_MAX_SECONDS: '60'
# guidellm --data spec (synthetic data config or HF dataset).
BENCHMARK_DATA: 'prompt_tokens=512,output_tokens=128'
# The benchmark always runs against a single InferenceSet replica so the
# numbers from guidellm and the per-pod vLLM metrics describe the same
# backend.
REPLICAS: '1'
# Port the vLLM (mock) model server listens on inside each inference pod.
# Matches `epp.modelServerPort` in charts/modeldeployment/values.yaml
# (KAITO PortInferenceServer = 5000) and is the same port that exposes
# the Prometheus `/metrics` endpoint with the `vllm:*` series.
MODEL_SERVER_PORT: '5000'
permissions:
contents: read
jobs:
benchmark:
runs-on: [ "self-hosted", "hostname:kaito-e2e-github-runner" ]
environment: e2e-test
permissions:
contents: read
steps:
- name: Checkout Repository
uses: actions/checkout@v6
with:
ref: ${{ github.ref }}
- name: E2E base setup
uses: ./.github/actions/e2e-base-setup
with:
resource-group: ${{ env.RESOURCE_GROUP }}
cluster-name: ${{ env.CLUSTER_NAME }}
acr-name: ${{ env.ACR_NAME }}
gpu-mocker-image: ${{ env.GPU_MOCKER_IMAGE }}
location: ${{ env.LOCATION }}
node-count: ${{ env.NODE_COUNT }}
node-vm-size: ${{ env.NODE_VM_SIZE }}
# All component versions fall through to the defaults baked into
# versions.env (consumed by the composite action).
istio-version: ''
gateway-api-version: ''
bbr-version: ''
keda-version: ''
keda-kaito-scaler-version: ''
llm-gateway-auth-version: ''
- name: Install modeldeployment Helm release
run: |
set -euo pipefail
echo "── Installing modeldeployment chart ──"
echo " release / name : ${DEPLOYMENT_NAME}"
echo " namespace : ${GATEWAY_NAMESPACE}"
echo " preset model : ${MODEL_PRESET}"
echo " replicas : ${REPLICAS}"
echo " instance type : ${INSTANCE_TYPE}"
echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE})"
helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \
--namespace "${GATEWAY_NAMESPACE}" \
--create-namespace \
--set name="${DEPLOYMENT_NAME}" \
--set namespace="${GATEWAY_NAMESPACE}" \
--set model="${MODEL_PRESET}" \
--set replicas="${REPLICAS}" \
--set instanceType="${INSTANCE_TYPE}" \
--set gatewayName="${GATEWAY_NAME}" \
--wait --timeout=5m
- name: Wait for inference pods to become Ready
run: |
set -euo pipefail
echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──"
# Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches).
kubectl -n "${GATEWAY_NAMESPACE}" rollout status \
"deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m
# Wait for the inference (shadow) pods. KAITO labels each
# inference pod with `inferenceset.kaito.sh/created-by=<name>`.
deadline=$(( $(date +%s) + 600 ))
while [ "$(date +%s)" -lt "${deadline}" ]; do
ready=$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
-l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
-o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
2>/dev/null | grep -c '^True$' || true)
echo " ready pods: ${ready}/${REPLICAS}"
if [ "${ready}" -ge "${REPLICAS}" ]; then
echo "✓ all ${REPLICAS} inference pods are Ready"
break
fi
sleep 10
done
if [ "${ready:-0}" -lt "${REPLICAS}" ]; then
echo "✗ inference pods did not all become Ready within 10m"
kubectl -n "${GATEWAY_NAMESPACE}" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
exit 1
fi
- name: Wait for default APIKey Secret
run: |
set -euo pipefail
echo "── Waiting for Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
for _ in $(seq 1 60); do
if kubectl -n "${GATEWAY_NAMESPACE}" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
echo "✓ Secret found"
exit 0
fi
sleep 5
done
echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} did not appear within 5m"
exit 1
- name: Start gateway port-forward and publish endpoint / API key
id: endpoint
run: |
set -euo pipefail
SERVICE="${GATEWAY_NAME}-istio"
# Map the apikey-authz subdomain to localhost so every request
# going through the kubectl port-forward carries the correct
# Host header (`<namespace>.gw.example.com`) without having to
# hand-craft it for every client (curl, guidellm/httpx).
if ! grep -qE "[[:space:]]${GATEWAY_HOSTNAME}([[:space:]]|$)" /etc/hosts; then
echo "127.0.0.1 ${GATEWAY_HOSTNAME}" | sudo tee -a /etc/hosts >/dev/null
echo "✓ added ${GATEWAY_HOSTNAME} → 127.0.0.1 to /etc/hosts"
fi
# Start kubectl port-forward in the background. Output goes to a
# log file so the workflow can dump it on failure.
mkdir -p /tmp/benchmark-pf
nohup kubectl -n "${GATEWAY_NAMESPACE}" port-forward \
"svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \
> /tmp/benchmark-pf/port-forward.log 2>&1 &
PF_PID=$!
echo "${PF_PID}" > /tmp/benchmark-pf/pid
echo "started kubectl port-forward (pid=${PF_PID})"
# Wait for the local port to accept TCP connections.
for _ in $(seq 1 30); do
if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then
echo "✓ port-forward ready on localhost:${GATEWAY_LOCAL_PORT}"
break
fi
sleep 1
done
if ! (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then
echo "✗ port-forward did not become ready"
cat /tmp/benchmark-pf/port-forward.log || true
exit 1
fi
API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE}" get secret \
"${APIKEY_SECRET_NAME}" \
-o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)"
# Use the apikey-authz hostname in the URL itself — with the
# /etc/hosts entry above this still resolves to 127.0.0.1 and
# hits the kubectl port-forward, but curl/httpx now set
# `Host: ${GATEWAY_HOSTNAME}` automatically.
ENDPOINT="http://${GATEWAY_HOSTNAME}:${GATEWAY_LOCAL_PORT}"
echo ""
echo "════════════════════════════════════════════════════════════════════"
echo "ModelDeployment endpoint ready"
echo " endpoint : ${ENDPOINT}"
echo " model : ${DEPLOYMENT_NAME}"
echo " api key : ${API_KEY}"
echo ""
echo " Example smoke test:"
echo " curl -sS -H \"Authorization: Bearer ${API_KEY}\" \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\":\"${DEPLOYMENT_NAME}\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}' \\"
echo " ${ENDPOINT}/v1/chat/completions"
echo "════════════════════════════════════════════════════════════════════"
echo ""
# Mask the API key in subsequent log output.
echo "::add-mask::${API_KEY}"
# Publish to later steps (the literal value is masked above).
{
echo "endpoint=${ENDPOINT}"
echo "api_key=${API_KEY}"
} >> "${GITHUB_OUTPUT}"
- name: Smoke-test the endpoint
env:
ENDPOINT: ${{ steps.endpoint.outputs.endpoint }}
API_KEY: ${{ steps.endpoint.outputs.api_key }}
MODEL: ${{ env.DEPLOYMENT_NAME }}
run: |
set -euo pipefail
echo "── Sending a single OpenAI-format request to verify the path Gateway → BBR → EPP → pod ──"
curl -sS --fail-with-body \
-H "Authorization: Bearer ${API_KEY}" \
-H 'Content-Type: application/json' \
-d "{\"model\":\"${MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":16}" \
"${ENDPOINT}/v1/chat/completions" | tee /tmp/benchmark-pf/smoke.json
echo ""
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install guidellm
run: |
set -euo pipefail
python -m pip install --upgrade pip
# Mirrors the version pattern used by KAITO's vLLM benchmark
# entrypoint (https://github.com/kaito-project/kaito/blob/main/presets/workspace/inference/vllm/benchmark_entrypoint.py).
pip install 'guidellm[recommended]'
guidellm --version
- name: Run guidellm benchmark
env:
ENDPOINT: ${{ steps.endpoint.outputs.endpoint }}
API_KEY: ${{ steps.endpoint.outputs.api_key }}
MODEL: ${{ env.DEPLOYMENT_NAME }}
run: |
set -euo pipefail
mkdir -p ./benchmark-results
echo "── Running guidellm against ${ENDPOINT} (model=${MODEL}) ──"
echo " profile=${BENCHMARK_PROFILE} max-seconds=${BENCHMARK_MAX_SECONDS} data=${BENCHMARK_DATA}"
echo ""
# `--backend-kwargs` carries the OpenAI bearer token used by the
# llm-gateway-apikey AuthorizationPolicy guarding the
# `inference-gateway`. The console output below contains the
# TTFT, TPOT (per-output-token latency) and request/token
# throughput tables that satisfy the "TTFT / TPOT / TPM" report
# required by the workflow spec.
guidellm benchmark run \
--target "${ENDPOINT}" \
--model "${MODEL}" \
--backend-kwargs "{\"api_key\":\"${API_KEY}\",\"verify\":false}" \
--profile "${BENCHMARK_PROFILE}" \
--max-seconds "${BENCHMARK_MAX_SECONDS}" \
--data "${BENCHMARK_DATA}" \
--output-dir ./benchmark-results \
--outputs json \
--outputs csv \
--disable-console-interactive
echo ""
echo "── benchmark-results contents ──"
ls -al ./benchmark-results || true
- name: Scrape vLLM pod metrics
if: always()
run: |
set -euo pipefail
mkdir -p ./benchmark-results/vllm-metrics
echo "── Scraping vLLM /metrics from each inference pod (port ${MODEL_SERVER_PORT}) ──"
echo " Cross-checks the guidellm summary (TTFT / TPOT / throughput) against the"
echo " authoritative numbers reported by the vLLM (mock) model server itself."
echo ""
PODS="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
-l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
-o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
if [ -z "${PODS}" ]; then
echo "✗ no inference pods found for ${DEPLOYMENT_NAME}"
exit 0
fi
# Metrics of interest. vLLM exposes TTFT / TPOT / e2e latency as
# Prometheus histograms (the *_sum / *_count series let us compute
# accurate averages over the entire benchmark window).
KEY_METRICS='^(vllm:num_requests_running|vllm:num_requests_waiting|vllm:request_success_total|vllm:request_failure_total|vllm:prompt_tokens_total|vllm:generation_tokens_total|vllm:time_to_first_token_seconds_(sum|count)|vllm:time_per_output_token_seconds_(sum|count)|vllm:e2e_request_latency_seconds_(sum|count)|vllm:request_prompt_tokens_(sum|count)|vllm:request_generation_tokens_(sum|count)|vllm:kv_cache_usage_perc|vllm:gpu_cache_usage_perc)( |\\{)'
for POD in ${PODS}; do
RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt"
echo "── ${POD} ──"
if ! kubectl -n "${GATEWAY_NAMESPACE}" get \
--raw "/api/v1/namespaces/${GATEWAY_NAMESPACE}/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
> "${RAW}" 2>/tmp/scrape.err; then
echo " ✗ failed to scrape /metrics:"
sed 's/^/ /' /tmp/scrape.err || true
continue
fi
echo " raw metrics saved to ${RAW} ($(wc -l < "${RAW}") lines)"
echo ""
echo " key vllm:* series:"
grep -E "${KEY_METRICS}" "${RAW}" | sed 's/^/ /' || echo " (no matching series found)"
echo ""
# Derive averages from the histogram *_sum / *_count pairs so the
# workflow log carries a directly comparable TTFT / TPOT / e2e
# latency next to guidellm's own report. Implemented in awk to
# avoid a python heredoc (the YAML block-scalar indentation
# would prevent the closing tag from being recognised).
awk -v pod="${POD}" '
/^[#]/ { next }
/^[[:space:]]*$/ { next }
{
# strip optional {label=...} block before the value
name = $1
sub(/\{.*\}/, "", name)
val = $NF + 0
tot[name] += val
}
END {
printf(" vLLM-derived summary for pod %s:\n", pod)
printf(" requests succeeded : %d\n", tot["vllm:request_success_total"])
printf(" requests failed : %d\n", tot["vllm:request_failure_total"])
printf(" prompt tokens total : %d\n", tot["vllm:prompt_tokens_total"])
printf(" generation tokens total : %d\n", tot["vllm:generation_tokens_total"])
ttft_n = tot["vllm:time_to_first_token_seconds_count"]
tpot_n = tot["vllm:time_per_output_token_seconds_count"]
e2e_n = tot["vllm:e2e_request_latency_seconds_count"]
if (ttft_n > 0) printf(" avg TTFT (time-to-first-tok) : %.2f ms\n", 1000 * tot["vllm:time_to_first_token_seconds_sum"] / ttft_n); else print " avg TTFT (time-to-first-tok) : n/a"
if (tpot_n > 0) printf(" avg TPOT (per-output-token) : %.2f ms\n", 1000 * tot["vllm:time_per_output_token_seconds_sum"] / tpot_n); else print " avg TPOT (per-output-token) : n/a"
if (e2e_n > 0) printf(" avg e2e (request latency) : %.2f ms\n", 1000 * tot["vllm:e2e_request_latency_seconds_sum"] / e2e_n); else print " avg e2e (request latency) : n/a"
}
' "${RAW}" || true
echo ""
done
- name: Upload guidellm results
if: always()
uses: actions/upload-artifact@v4
with:
name: guidellm-benchmark-results-${{ github.run_id }}
path: ./benchmark-results
if-no-files-found: warn
- name: Stop gateway port-forward
if: always()
run: |
set +e
if [ -f /tmp/benchmark-pf/pid ]; then
PF_PID="$(cat /tmp/benchmark-pf/pid)"
echo "stopping kubectl port-forward (pid=${PF_PID})"
kill "${PF_PID}" 2>/dev/null || true
fi
if [ -f /tmp/benchmark-pf/port-forward.log ]; then
echo "── port-forward log tail ──"
tail -n 50 /tmp/benchmark-pf/port-forward.log || true
fi
- name: Uninstall modeldeployment Helm release
if: always()
run: |
helm uninstall "${DEPLOYMENT_NAME}" \
--namespace "${GATEWAY_NAMESPACE}" \
--ignore-not-found --wait || true
- name: Dump cluster state
if: failure()
run: make e2e-dump
- name: Teardown cluster
if: always()
run: make e2e-teardown
env:
RESOURCE_GROUP: ${{ env.RESOURCE_GROUP }}