feat: add benchmark workflow #2
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: ModelDeployment benchmark | |
| # Workflow that: | |
| # 1. Reuses the E2E base setup composite action to bring up the AKS | |
| # cluster + all stack components (Istio gateway, BBR, EPP, KEDA, | |
| # llm-gateway-apikey, gpu-node-mocker, KAITO). | |
| # 2. Installs the modeldeployment Helm chart in the `default` namespace, | |
| # reusing the cluster-wide `inference-gateway` Gateway and the | |
| # cluster-wide `default` APIKey (Secret `llm-api-key`). | |
| # 3. Starts a `kubectl port-forward` against the gateway service so the | |
| # complete request path Gateway → BBR → EPP → vLLM (mock) pod can be | |
| # exercised from localhost. The endpoint URL and API key are echoed | |
| # to the workflow log. | |
| # 4. Installs guidellm and runs a sweep benchmark against the endpoint, | |
| # printing TTFT / TPOT / TPM (and the rest of the guidellm console | |
| # summary) into the workflow log. | |
| # | |
| # All component versions and benchmark parameters are hard-coded in the | |
| # `env:` block below — there are no workflow_dispatch inputs. To change a | |
| # value, edit this file (and rely on the pull_request trigger to validate | |
| # the change end-to-end). | |
| on: | |
| # Run on PRs that touch the benchmark workflow or anything it exercises | |
| # (modeldeployment chart, gpu-node-mocker, e2e composite action) so the | |
| # full path Gateway → BBR → EPP → vLLM (mock) pod can be debugged from | |
| # a PR. | |
| pull_request: | |
| branches: [ main ] | |
| paths: | |
| - '.github/workflows/benchmark.yaml' | |
| - '.github/actions/e2e-base-setup/**' | |
| - 'charts/modeldeployment/**' | |
| - 'cmd/gpu-node-mocker/**' | |
| - 'pkg/gpu-node-mocker/**' | |
| - 'docker/Dockerfile' | |
| - 'versions.env' | |
| workflow_dispatch: | |
| env: | |
| RESOURCE_GROUP: "kaito-gw-bench-rg-${{ github.run_id }}" | |
| CLUSTER_NAME: "kaito-gw-bench-aks-${{ github.run_id }}" | |
| ACR_NAME: "kaitogwbenchaks${{ github.run_id }}acr" | |
| GPU_MOCKER_IMAGE: "gpu-node-mocker:latest-${{ github.run_id }}" | |
| LOCATION: swedencentral | |
| NODE_COUNT: '3' | |
| NODE_VM_SIZE: Standard_D8s_v5 | |
| GATEWAY_NAME: inference-gateway | |
| GATEWAY_NAMESPACE: default | |
| GATEWAY_LOCAL_PORT: '18080' | |
| APIKEY_SECRET_NAME: llm-api-key | |
| APIKEY_SECRET_KEY: apiKey | |
| # Host header sent on every request through the local port-forward. | |
| # The cluster-wide llm-gateway-apikey AuthorizationPolicy resolves the | |
| # gateway namespace from the request's Host header subdomain | |
| # (`<namespace>.gw.example.com`). Without this header the authz service | |
| # rejects the request with HTTP 401 | |
| # `cannot determine gateway namespace: set context_extensions[gateway-namespace] or use subdomain-based host`. | |
| # The e2e suite uses the same subdomain convention (see | |
| # SendChatCompletionWithAuth in test/e2e/utils/http.go). | |
| GATEWAY_HOSTNAME: 'default.gw.example.com' | |
| # ----- Benchmark parameters (previously workflow_dispatch inputs) ----- | |
| # KAITO inference preset to benchmark (must exist in KAITO main HEAD | |
| # model_catalog.yaml). | |
| MODEL_PRESET: 'phi-4-mini-instruct' | |
| # Helm release / InferenceSet name. Also the value of the `model` field | |
| # in OpenAI requests (matched by the HTTPRoute as X-Gateway-Model-Name). | |
| DEPLOYMENT_NAME: 'benchmark-phi' | |
| # VM instance type passed to the InferenceSet (must be a SKU the | |
| # gpu-node-mocker labels as a GPU node). | |
| INSTANCE_TYPE: 'Standard_NV36ads_A10_v5' | |
| # guidellm benchmark profile (sweep / throughput / concurrent / | |
| # constant / poisson / synchronous). | |
| BENCHMARK_PROFILE: 'sweep' | |
| # Maximum seconds per guidellm sub-benchmark. | |
| BENCHMARK_MAX_SECONDS: '60' | |
| # guidellm --data spec (synthetic data config or HF dataset). | |
| BENCHMARK_DATA: 'prompt_tokens=512,output_tokens=128' | |
| # The benchmark always runs against a single InferenceSet replica so the | |
| # numbers from guidellm and the per-pod vLLM metrics describe the same | |
| # backend. | |
| REPLICAS: '1' | |
| # Port the vLLM (mock) model server listens on inside each inference pod. | |
| # Matches `epp.modelServerPort` in charts/modeldeployment/values.yaml | |
| # (KAITO PortInferenceServer = 5000) and is the same port that exposes | |
| # the Prometheus `/metrics` endpoint with the `vllm:*` series. | |
| MODEL_SERVER_PORT: '5000' | |
| permissions: | |
| contents: read | |
| jobs: | |
| benchmark: | |
| runs-on: [ "self-hosted", "hostname:kaito-e2e-github-runner" ] | |
| environment: e2e-test | |
| permissions: | |
| contents: read | |
| steps: | |
| - name: Checkout Repository | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: ${{ github.ref }} | |
| - name: E2E base setup | |
| uses: ./.github/actions/e2e-base-setup | |
| with: | |
| resource-group: ${{ env.RESOURCE_GROUP }} | |
| cluster-name: ${{ env.CLUSTER_NAME }} | |
| acr-name: ${{ env.ACR_NAME }} | |
| gpu-mocker-image: ${{ env.GPU_MOCKER_IMAGE }} | |
| location: ${{ env.LOCATION }} | |
| node-count: ${{ env.NODE_COUNT }} | |
| node-vm-size: ${{ env.NODE_VM_SIZE }} | |
| # All component versions fall through to the defaults baked into | |
| # versions.env (consumed by the composite action). | |
| istio-version: '' | |
| gateway-api-version: '' | |
| bbr-version: '' | |
| keda-version: '' | |
| keda-kaito-scaler-version: '' | |
| llm-gateway-auth-version: '' | |
| - name: Install modeldeployment Helm release | |
| run: | | |
| set -euo pipefail | |
| echo "── Installing modeldeployment chart ──" | |
| echo " release / name : ${DEPLOYMENT_NAME}" | |
| echo " namespace : ${GATEWAY_NAMESPACE}" | |
| echo " preset model : ${MODEL_PRESET}" | |
| echo " replicas : ${REPLICAS}" | |
| echo " instance type : ${INSTANCE_TYPE}" | |
| echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE})" | |
| helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \ | |
| --namespace "${GATEWAY_NAMESPACE}" \ | |
| --create-namespace \ | |
| --set name="${DEPLOYMENT_NAME}" \ | |
| --set namespace="${GATEWAY_NAMESPACE}" \ | |
| --set model="${MODEL_PRESET}" \ | |
| --set replicas="${REPLICAS}" \ | |
| --set instanceType="${INSTANCE_TYPE}" \ | |
| --set gatewayName="${GATEWAY_NAME}" \ | |
| --wait --timeout=5m | |
| - name: Wait for inference pods to become Ready | |
| run: | | |
| set -euo pipefail | |
| echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──" | |
| # Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches). | |
| kubectl -n "${GATEWAY_NAMESPACE}" rollout status \ | |
| "deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m | |
| # Wait for the inference (shadow) pods. KAITO labels each | |
| # inference pod with `inferenceset.kaito.sh/created-by=<name>`. | |
| deadline=$(( $(date +%s) + 600 )) | |
| while [ "$(date +%s)" -lt "${deadline}" ]; do | |
| ready=$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \ | |
| -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \ | |
| -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \ | |
| 2>/dev/null | grep -c '^True$' || true) | |
| echo " ready pods: ${ready}/${REPLICAS}" | |
| if [ "${ready}" -ge "${REPLICAS}" ]; then | |
| echo "✓ all ${REPLICAS} inference pods are Ready" | |
| break | |
| fi | |
| sleep 10 | |
| done | |
| if [ "${ready:-0}" -lt "${REPLICAS}" ]; then | |
| echo "✗ inference pods did not all become Ready within 10m" | |
| kubectl -n "${GATEWAY_NAMESPACE}" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide | |
| exit 1 | |
| fi | |
| - name: Wait for default APIKey Secret | |
| run: | | |
| set -euo pipefail | |
| echo "── Waiting for Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──" | |
| for _ in $(seq 1 60); do | |
| if kubectl -n "${GATEWAY_NAMESPACE}" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then | |
| echo "✓ Secret found" | |
| exit 0 | |
| fi | |
| sleep 5 | |
| done | |
| echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} did not appear within 5m" | |
| exit 1 | |
| - name: Start gateway port-forward and publish endpoint / API key | |
| id: endpoint | |
| run: | | |
| set -euo pipefail | |
| SERVICE="${GATEWAY_NAME}-istio" | |
| # Map the apikey-authz subdomain to localhost so every request | |
| # going through the kubectl port-forward carries the correct | |
| # Host header (`<namespace>.gw.example.com`) without having to | |
| # hand-craft it for every client (curl, guidellm/httpx). | |
| if ! grep -qE "[[:space:]]${GATEWAY_HOSTNAME}([[:space:]]|$)" /etc/hosts; then | |
| echo "127.0.0.1 ${GATEWAY_HOSTNAME}" | sudo tee -a /etc/hosts >/dev/null | |
| echo "✓ added ${GATEWAY_HOSTNAME} → 127.0.0.1 to /etc/hosts" | |
| fi | |
| # Start kubectl port-forward in the background. Output goes to a | |
| # log file so the workflow can dump it on failure. | |
| mkdir -p /tmp/benchmark-pf | |
| nohup kubectl -n "${GATEWAY_NAMESPACE}" port-forward \ | |
| "svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \ | |
| > /tmp/benchmark-pf/port-forward.log 2>&1 & | |
| PF_PID=$! | |
| echo "${PF_PID}" > /tmp/benchmark-pf/pid | |
| echo "started kubectl port-forward (pid=${PF_PID})" | |
| # Wait for the local port to accept TCP connections. | |
| for _ in $(seq 1 30); do | |
| if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then | |
| echo "✓ port-forward ready on localhost:${GATEWAY_LOCAL_PORT}" | |
| break | |
| fi | |
| sleep 1 | |
| done | |
| if ! (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then | |
| echo "✗ port-forward did not become ready" | |
| cat /tmp/benchmark-pf/port-forward.log || true | |
| exit 1 | |
| fi | |
| API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE}" get secret \ | |
| "${APIKEY_SECRET_NAME}" \ | |
| -o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)" | |
| # Use the apikey-authz hostname in the URL itself — with the | |
| # /etc/hosts entry above this still resolves to 127.0.0.1 and | |
| # hits the kubectl port-forward, but curl/httpx now set | |
| # `Host: ${GATEWAY_HOSTNAME}` automatically. | |
| ENDPOINT="http://${GATEWAY_HOSTNAME}:${GATEWAY_LOCAL_PORT}" | |
| echo "" | |
| echo "════════════════════════════════════════════════════════════════════" | |
| echo "ModelDeployment endpoint ready" | |
| echo " endpoint : ${ENDPOINT}" | |
| echo " model : ${DEPLOYMENT_NAME}" | |
| echo " api key : ${API_KEY}" | |
| echo "" | |
| echo " Example smoke test:" | |
| echo " curl -sS -H \"Authorization: Bearer ${API_KEY}\" \\" | |
| echo " -H 'Content-Type: application/json' \\" | |
| echo " -d '{\"model\":\"${DEPLOYMENT_NAME}\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}' \\" | |
| echo " ${ENDPOINT}/v1/chat/completions" | |
| echo "════════════════════════════════════════════════════════════════════" | |
| echo "" | |
| # Mask the API key in subsequent log output. | |
| echo "::add-mask::${API_KEY}" | |
| # Publish to later steps (the literal value is masked above). | |
| { | |
| echo "endpoint=${ENDPOINT}" | |
| echo "api_key=${API_KEY}" | |
| } >> "${GITHUB_OUTPUT}" | |
| - name: Smoke-test the endpoint | |
| env: | |
| ENDPOINT: ${{ steps.endpoint.outputs.endpoint }} | |
| API_KEY: ${{ steps.endpoint.outputs.api_key }} | |
| MODEL: ${{ env.DEPLOYMENT_NAME }} | |
| run: | | |
| set -euo pipefail | |
| echo "── Sending a single OpenAI-format request to verify the path Gateway → BBR → EPP → pod ──" | |
| curl -sS --fail-with-body \ | |
| -H "Authorization: Bearer ${API_KEY}" \ | |
| -H 'Content-Type: application/json' \ | |
| -d "{\"model\":\"${MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":16}" \ | |
| "${ENDPOINT}/v1/chat/completions" | tee /tmp/benchmark-pf/smoke.json | |
| echo "" | |
| - name: Setup Python | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.12' | |
| - name: Install guidellm | |
| run: | | |
| set -euo pipefail | |
| python -m pip install --upgrade pip | |
| # Mirrors the version pattern used by KAITO's vLLM benchmark | |
| # entrypoint (https://github.com/kaito-project/kaito/blob/main/presets/workspace/inference/vllm/benchmark_entrypoint.py). | |
| pip install 'guidellm[recommended]' | |
| guidellm --version | |
| - name: Run guidellm benchmark | |
| env: | |
| ENDPOINT: ${{ steps.endpoint.outputs.endpoint }} | |
| API_KEY: ${{ steps.endpoint.outputs.api_key }} | |
| MODEL: ${{ env.DEPLOYMENT_NAME }} | |
| run: | | |
| set -euo pipefail | |
| mkdir -p ./benchmark-results | |
| echo "── Running guidellm against ${ENDPOINT} (model=${MODEL}) ──" | |
| echo " profile=${BENCHMARK_PROFILE} max-seconds=${BENCHMARK_MAX_SECONDS} data=${BENCHMARK_DATA}" | |
| echo "" | |
| # `--backend-kwargs` carries the OpenAI bearer token used by the | |
| # llm-gateway-apikey AuthorizationPolicy guarding the | |
| # `inference-gateway`. The console output below contains the | |
| # TTFT, TPOT (per-output-token latency) and request/token | |
| # throughput tables that satisfy the "TTFT / TPOT / TPM" report | |
| # required by the workflow spec. | |
| guidellm benchmark run \ | |
| --target "${ENDPOINT}" \ | |
| --model "${MODEL}" \ | |
| --backend-kwargs "{\"api_key\":\"${API_KEY}\",\"verify\":false}" \ | |
| --profile "${BENCHMARK_PROFILE}" \ | |
| --max-seconds "${BENCHMARK_MAX_SECONDS}" \ | |
| --data "${BENCHMARK_DATA}" \ | |
| --output-dir ./benchmark-results \ | |
| --outputs json \ | |
| --outputs csv \ | |
| --disable-console-interactive | |
| echo "" | |
| echo "── benchmark-results contents ──" | |
| ls -al ./benchmark-results || true | |
| - name: Scrape vLLM pod metrics | |
| if: always() | |
| run: | | |
| set -euo pipefail | |
| mkdir -p ./benchmark-results/vllm-metrics | |
| echo "── Scraping vLLM /metrics from each inference pod (port ${MODEL_SERVER_PORT}) ──" | |
| echo " Cross-checks the guidellm summary (TTFT / TPOT / throughput) against the" | |
| echo " authoritative numbers reported by the vLLM (mock) model server itself." | |
| echo "" | |
| PODS="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \ | |
| -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \ | |
| -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')" | |
| if [ -z "${PODS}" ]; then | |
| echo "✗ no inference pods found for ${DEPLOYMENT_NAME}" | |
| exit 0 | |
| fi | |
| # Metrics of interest. vLLM exposes TTFT / TPOT / e2e latency as | |
| # Prometheus histograms (the *_sum / *_count series let us compute | |
| # accurate averages over the entire benchmark window). | |
| KEY_METRICS='^(vllm:num_requests_running|vllm:num_requests_waiting|vllm:request_success_total|vllm:request_failure_total|vllm:prompt_tokens_total|vllm:generation_tokens_total|vllm:time_to_first_token_seconds_(sum|count)|vllm:time_per_output_token_seconds_(sum|count)|vllm:e2e_request_latency_seconds_(sum|count)|vllm:request_prompt_tokens_(sum|count)|vllm:request_generation_tokens_(sum|count)|vllm:kv_cache_usage_perc|vllm:gpu_cache_usage_perc)( |\\{)' | |
| for POD in ${PODS}; do | |
| RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt" | |
| echo "── ${POD} ──" | |
| if ! kubectl -n "${GATEWAY_NAMESPACE}" get \ | |
| --raw "/api/v1/namespaces/${GATEWAY_NAMESPACE}/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \ | |
| > "${RAW}" 2>/tmp/scrape.err; then | |
| echo " ✗ failed to scrape /metrics:" | |
| sed 's/^/ /' /tmp/scrape.err || true | |
| continue | |
| fi | |
| echo " raw metrics saved to ${RAW} ($(wc -l < "${RAW}") lines)" | |
| echo "" | |
| echo " key vllm:* series:" | |
| grep -E "${KEY_METRICS}" "${RAW}" | sed 's/^/ /' || echo " (no matching series found)" | |
| echo "" | |
| # Derive averages from the histogram *_sum / *_count pairs so the | |
| # workflow log carries a directly comparable TTFT / TPOT / e2e | |
| # latency next to guidellm's own report. Implemented in awk to | |
| # avoid a python heredoc (the YAML block-scalar indentation | |
| # would prevent the closing tag from being recognised). | |
| awk -v pod="${POD}" ' | |
| /^[#]/ { next } | |
| /^[[:space:]]*$/ { next } | |
| { | |
| # strip optional {label=...} block before the value | |
| name = $1 | |
| sub(/\{.*\}/, "", name) | |
| val = $NF + 0 | |
| tot[name] += val | |
| } | |
| END { | |
| printf(" vLLM-derived summary for pod %s:\n", pod) | |
| printf(" requests succeeded : %d\n", tot["vllm:request_success_total"]) | |
| printf(" requests failed : %d\n", tot["vllm:request_failure_total"]) | |
| printf(" prompt tokens total : %d\n", tot["vllm:prompt_tokens_total"]) | |
| printf(" generation tokens total : %d\n", tot["vllm:generation_tokens_total"]) | |
| ttft_n = tot["vllm:time_to_first_token_seconds_count"] | |
| tpot_n = tot["vllm:time_per_output_token_seconds_count"] | |
| e2e_n = tot["vllm:e2e_request_latency_seconds_count"] | |
| if (ttft_n > 0) printf(" avg TTFT (time-to-first-tok) : %.2f ms\n", 1000 * tot["vllm:time_to_first_token_seconds_sum"] / ttft_n); else print " avg TTFT (time-to-first-tok) : n/a" | |
| if (tpot_n > 0) printf(" avg TPOT (per-output-token) : %.2f ms\n", 1000 * tot["vllm:time_per_output_token_seconds_sum"] / tpot_n); else print " avg TPOT (per-output-token) : n/a" | |
| if (e2e_n > 0) printf(" avg e2e (request latency) : %.2f ms\n", 1000 * tot["vllm:e2e_request_latency_seconds_sum"] / e2e_n); else print " avg e2e (request latency) : n/a" | |
| } | |
| ' "${RAW}" || true | |
| echo "" | |
| done | |
| - name: Upload guidellm results | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: guidellm-benchmark-results-${{ github.run_id }} | |
| path: ./benchmark-results | |
| if-no-files-found: warn | |
| - name: Stop gateway port-forward | |
| if: always() | |
| run: | | |
| set +e | |
| if [ -f /tmp/benchmark-pf/pid ]; then | |
| PF_PID="$(cat /tmp/benchmark-pf/pid)" | |
| echo "stopping kubectl port-forward (pid=${PF_PID})" | |
| kill "${PF_PID}" 2>/dev/null || true | |
| fi | |
| if [ -f /tmp/benchmark-pf/port-forward.log ]; then | |
| echo "── port-forward log tail ──" | |
| tail -n 50 /tmp/benchmark-pf/port-forward.log || true | |
| fi | |
| - name: Uninstall modeldeployment Helm release | |
| if: always() | |
| run: | | |
| helm uninstall "${DEPLOYMENT_NAME}" \ | |
| --namespace "${GATEWAY_NAMESPACE}" \ | |
| --ignore-not-found --wait || true | |
| - name: Dump cluster state | |
| if: failure() | |
| run: make e2e-dump | |
| - name: Teardown cluster | |
| if: always() | |
| run: make e2e-teardown | |
| env: | |
| RESOURCE_GROUP: ${{ env.RESOURCE_GROUP }} |