feat: add benchmark workflow #2

Workflow file for this run

.github/workflows/benchmark.yaml at f1de7bf

	name: ModelDeployment benchmark

	# Workflow that:
	# 1. Reuses the E2E base setup composite action to bring up the AKS
	# cluster + all stack components (Istio gateway, BBR, EPP, KEDA,
	# llm-gateway-apikey, gpu-node-mocker, KAITO).
	# 2. Installs the modeldeployment Helm chart in the `default` namespace,
	# reusing the cluster-wide `inference-gateway` Gateway and the
	# cluster-wide `default` APIKey (Secret `llm-api-key`).
	# 3. Starts a `kubectl port-forward` against the gateway service so the
	# complete request path Gateway → BBR → EPP → vLLM (mock) pod can be
	# exercised from localhost. The endpoint URL and API key are echoed
	# to the workflow log.
	# 4. Installs guidellm and runs a sweep benchmark against the endpoint,
	# printing TTFT / TPOT / TPM (and the rest of the guidellm console
	# summary) into the workflow log.
	#
	# All component versions and benchmark parameters are hard-coded in the
	# `env:` block below — there are no workflow_dispatch inputs. To change a
	# value, edit this file (and rely on the pull_request trigger to validate
	# the change end-to-end).

	on:
	  # Triggered by PRs against main that modify this workflow or anything it
	  # exercises (modeldeployment chart, gpu-node-mocker, e2e composite
	  # action), so the full Gateway → BBR → EPP → vLLM (mock) pod path can be
	  # debugged straight from a PR.
	  pull_request:
	    branches:
	      - main
	    paths:
	      - ".github/workflows/benchmark.yaml"
	      - ".github/actions/e2e-base-setup/**"
	      - "charts/modeldeployment/**"
	      - "cmd/gpu-node-mocker/**"
	      - "pkg/gpu-node-mocker/**"
	      - "docker/Dockerfile"
	      - "versions.env"
	  # Manual runs take no inputs; every parameter lives in the `env:` block.
	  workflow_dispatch:

	env:
	  # ----- Cluster / infrastructure (names embed the run id) -----
	  RESOURCE_GROUP: 'kaito-gw-bench-rg-${{ github.run_id }}'
	  CLUSTER_NAME: 'kaito-gw-bench-aks-${{ github.run_id }}'
	  ACR_NAME: 'kaitogwbenchaks${{ github.run_id }}acr'
	  GPU_MOCKER_IMAGE: 'gpu-node-mocker:latest-${{ github.run_id }}'
	  LOCATION: swedencentral
	  NODE_COUNT: '3'
	  NODE_VM_SIZE: Standard_D8s_v5

	  # ----- Gateway / API key -----
	  GATEWAY_NAME: inference-gateway
	  GATEWAY_NAMESPACE: default
	  GATEWAY_LOCAL_PORT: '18080'
	  APIKEY_SECRET_NAME: llm-api-key
	  APIKEY_SECRET_KEY: apiKey
	  # Host header carried by every request sent through the local
	  # port-forward. The cluster-wide llm-gateway-apikey AuthorizationPolicy
	  # derives the gateway namespace from the subdomain of the request's Host
	  # header (`<namespace>.gw.example.com`). If the header is missing, the
	  # authz service answers HTTP 401 with
	  # `cannot determine gateway namespace: set context_extensions[gateway-namespace] or use subdomain-based host`.
	  # The e2e suite follows the same subdomain convention (see
	  # SendChatCompletionWithAuth in test/e2e/utils/http.go).
	  GATEWAY_HOSTNAME: 'default.gw.example.com'

	  # ----- Benchmark parameters (previously workflow_dispatch inputs) -----
	  # KAITO inference preset under test; it must exist in the
	  # model_catalog.yaml at KAITO main HEAD.
	  MODEL_PRESET: 'phi-4-mini-instruct'
	  # Name of the Helm release / InferenceSet. It doubles as the `model`
	  # field of OpenAI requests (the HTTPRoute matches it as
	  # X-Gateway-Model-Name).
	  DEPLOYMENT_NAME: 'benchmark-phi'
	  # VM instance type handed to the InferenceSet; it must be a SKU that the
	  # gpu-node-mocker labels as a GPU node.
	  INSTANCE_TYPE: 'Standard_NV36ads_A10_v5'
	  # guidellm benchmark profile: sweep / throughput / concurrent /
	  # constant / poisson / synchronous.
	  BENCHMARK_PROFILE: 'sweep'
	  # Upper bound, in seconds, for each guidellm sub-benchmark.
	  BENCHMARK_MAX_SECONDS: '60'
	  # Value of guidellm's --data option (synthetic data config or HF dataset).
	  BENCHMARK_DATA: 'prompt_tokens=512,output_tokens=128'
	  # A single InferenceSet replica is always used, so that the guidellm
	  # numbers and the per-pod vLLM metrics describe the same backend.
	  REPLICAS: '1'
	  # Port on which the vLLM (mock) model server listens inside each
	  # inference pod. It matches `epp.modelServerPort` in
	  # charts/modeldeployment/values.yaml (KAITO PortInferenceServer = 5000)
	  # and also serves the Prometheus `/metrics` endpoint with the `vllm:*`
	  # series.
	  MODEL_SERVER_PORT: '5000'

	# Workflow-level token scope: read access to repository contents.
	permissions:
	  contents: read

	jobs:
	  benchmark:
	    # The runner is selected by two labels: self-hosted plus a hostname label.
	    runs-on:
	      - self-hosted
	      - 'hostname:kaito-e2e-github-runner'
	    environment: e2e-test
	    # Job-level scope repeats the workflow-level one.
	    permissions:
	      contents: read

	    steps:
	# Check out the ref that triggered the run.
	- name: Checkout Repository
	  uses: actions/checkout@v6
	  with:
	    ref: '${{ github.ref }}'

	# Delegate cluster + stack bring-up to the shared composite action,
	# feeding it the infrastructure values from the workflow `env:` block.
	- name: E2E base setup
	  uses: ./.github/actions/e2e-base-setup
	  with:
	    resource-group: '${{ env.RESOURCE_GROUP }}'
	    cluster-name: '${{ env.CLUSTER_NAME }}'
	    acr-name: '${{ env.ACR_NAME }}'
	    gpu-mocker-image: '${{ env.GPU_MOCKER_IMAGE }}'
	    location: '${{ env.LOCATION }}'
	    node-count: '${{ env.NODE_COUNT }}'
	    node-vm-size: '${{ env.NODE_VM_SIZE }}'
	    # Empty strings: every component version falls back to the defaults
	    # baked into versions.env (consumed by the composite action).
	    istio-version: ""
	    gateway-api-version: ""
	    bbr-version: ""
	    keda-version: ""
	    keda-kaito-scaler-version: ""
	    llm-gateway-auth-version: ""

	# Installs (or upgrades) the modeldeployment chart into the gateway
	# namespace and blocks until Helm reports the release ready.
	- name: Install modeldeployment Helm release
	  run: |
	    set -euo pipefail
	    echo "── Installing modeldeployment chart ──"
	    echo " release / name : ${DEPLOYMENT_NAME}"
	    echo " namespace : ${GATEWAY_NAMESPACE}"
	    echo " preset model : ${MODEL_PRESET}"
	    echo " replicas : ${REPLICAS}"
	    echo " instance type : ${INSTANCE_TYPE}"
	    echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE})"

	    # All chart values come from the workflow-level `env:` block.
	    helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \
	      --namespace "${GATEWAY_NAMESPACE}" \
	      --create-namespace \
	      --set name="${DEPLOYMENT_NAME}" \
	      --set namespace="${GATEWAY_NAMESPACE}" \
	      --set model="${MODEL_PRESET}" \
	      --set replicas="${REPLICAS}" \
	      --set instanceType="${INSTANCE_TYPE}" \
	      --set gatewayName="${GATEWAY_NAME}" \
	      --wait --timeout=5m

	# Blocks until the EPP deployment has rolled out and at least REPLICAS
	# inference pods report Ready; fails the job after 10 minutes.
	- name: Wait for inference pods to become Ready
	  run: |
	    set -euo pipefail
	    echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──"

	    # Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches).
	    kubectl -n "${GATEWAY_NAMESPACE}" rollout status \
	      "deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m

	    # Wait for the inference (shadow) pods. KAITO labels each
	    # inference pod with `inferenceset.kaito.sh/created-by=<name>`.
	    ready=0
	    deadline=$(( $(date +%s) + 600 ))
	    while [ "$(date +%s)" -lt "${deadline}" ]; do
	      # grep -c prints the count but exits 1 on zero matches; `|| true`
	      # keeps `set -e` / pipefail from aborting while pods are not Ready.
	      ready=$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
	        -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
	        -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
	        2>/dev/null | grep -c '^True$' || true)
	      echo " ready pods: ${ready}/${REPLICAS}"
	      if [ "${ready}" -ge "${REPLICAS}" ]; then
	        echo "✓ all ${REPLICAS} inference pods are Ready"
	        break
	      fi
	      sleep 10
	    done
	    # Reached either via `break` (success) or by running out the deadline.
	    if [ "${ready:-0}" -lt "${REPLICAS}" ]; then
	      echo "✗ inference pods did not all become Ready within 10m"
	      kubectl -n "${GATEWAY_NAMESPACE}" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
	      exit 1
	    fi

	# Polls for the API-key Secret (60 attempts, 5 s apart) and fails the job
	# if it never appears.
	- name: Wait for default APIKey Secret
	  run: |
	    set -euo pipefail
	    echo "── Waiting for Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
	    for _ in $(seq 1 60); do
	      if kubectl -n "${GATEWAY_NAMESPACE}" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
	        echo "✓ Secret found"
	        exit 0
	      fi
	      sleep 5
	    done
	    echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} did not appear within 5m"
	    exit 1

	# Starts a background `kubectl port-forward` to the gateway service, reads
	# the API key from the Secret, and publishes `endpoint` / `api_key` as step
	# outputs. The key is registered with `::add-mask::` BEFORE anything that
	# contains it is printed, so it never appears in clear text in the log.
	- name: Start gateway port-forward and publish endpoint / API key
	  id: endpoint
	  run: |
	    set -euo pipefail
	    SERVICE="${GATEWAY_NAME}-istio"

	    # Map the apikey-authz subdomain to localhost so every request
	    # going through the kubectl port-forward carries the correct
	    # Host header (`<namespace>.gw.example.com`) without having to
	    # hand-craft it for every client (curl, guidellm/httpx).
	    if ! grep -qE "[[:space:]]${GATEWAY_HOSTNAME}([[:space:]]|$)" /etc/hosts; then
	      echo "127.0.0.1 ${GATEWAY_HOSTNAME}" | sudo tee -a /etc/hosts >/dev/null
	      echo "✓ added ${GATEWAY_HOSTNAME} → 127.0.0.1 to /etc/hosts"
	    fi

	    # Start kubectl port-forward in the background. Output goes to a
	    # log file so the workflow can dump it on failure.
	    mkdir -p /tmp/benchmark-pf
	    nohup kubectl -n "${GATEWAY_NAMESPACE}" port-forward \
	      "svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \
	      > /tmp/benchmark-pf/port-forward.log 2>&1 &
	    PF_PID=$!
	    # The pid file is read by the "Stop gateway port-forward" step.
	    echo "${PF_PID}" > /tmp/benchmark-pf/pid
	    echo "started kubectl port-forward (pid=${PF_PID})"

	    # Wait for the local port to accept TCP connections (bash /dev/tcp).
	    for _ in $(seq 1 30); do
	      if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then
	        echo "✓ port-forward ready on localhost:${GATEWAY_LOCAL_PORT}"
	        break
	      fi
	      sleep 1
	    done
	    if ! (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then
	      echo "✗ port-forward did not become ready"
	      cat /tmp/benchmark-pf/port-forward.log || true
	      exit 1
	    fi

	    API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE}" get secret \
	      "${APIKEY_SECRET_NAME}" \
	      -o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)"

	    # jsonpath yields an empty string when the data key is absent; fail
	    # here with a clear message instead of a 401 in a later step.
	    if [ -z "${API_KEY}" ]; then
	      echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} has no '${APIKEY_SECRET_KEY}' value"
	      exit 1
	    fi

	    # Mask the API key FIRST: add-mask only affects log lines emitted
	    # after it, so it must precede every echo that contains the key.
	    echo "::add-mask::${API_KEY}"

	    # Use the apikey-authz hostname in the URL itself — with the
	    # /etc/hosts entry above this still resolves to 127.0.0.1 and
	    # hits the kubectl port-forward, but curl/httpx now set
	    # `Host: ${GATEWAY_HOSTNAME}` automatically.
	    ENDPOINT="http://${GATEWAY_HOSTNAME}:${GATEWAY_LOCAL_PORT}"

	    # The key in the lines below is rendered as `***` by the runner.
	    echo ""
	    echo "════════════════════════════════════════════════════════════════════"
	    echo "ModelDeployment endpoint ready"
	    echo " endpoint : ${ENDPOINT}"
	    echo " model : ${DEPLOYMENT_NAME}"
	    echo " api key : ${API_KEY}"
	    echo ""
	    echo " Example smoke test:"
	    echo " curl -sS -H \"Authorization: Bearer ${API_KEY}\" \\"
	    echo " -H 'Content-Type: application/json' \\"
	    echo " -d '{\"model\":\"${DEPLOYMENT_NAME}\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}' \\"
	    echo " ${ENDPOINT}/v1/chat/completions"
	    echo "════════════════════════════════════════════════════════════════════"
	    echo ""

	    # Publish to later steps (the literal value is masked above).
	    {
	      echo "endpoint=${ENDPOINT}"
	      echo "api_key=${API_KEY}"
	    } >> "${GITHUB_OUTPUT}"

	# Sends one chat-completion request through the port-forward; any HTTP
	# error fails the step (curl --fail-with-body + pipefail). The response
	# body is printed and saved to /tmp/benchmark-pf/smoke.json.
	- name: Smoke-test the endpoint
	  env:
	    ENDPOINT: ${{ steps.endpoint.outputs.endpoint }}
	    API_KEY: ${{ steps.endpoint.outputs.api_key }}
	    MODEL: ${{ env.DEPLOYMENT_NAME }}
	  run: |
	    set -euo pipefail
	    echo "── Sending a single OpenAI-format request to verify the path Gateway → BBR → EPP → pod ──"
	    curl -sS --fail-with-body \
	      -H "Authorization: Bearer ${API_KEY}" \
	      -H 'Content-Type: application/json' \
	      -d "{\"model\":\"${MODEL}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":16}" \
	      "${ENDPOINT}/v1/chat/completions" | tee /tmp/benchmark-pf/smoke.json
	    echo ""

	# Python interpreter used for the guidellm install and benchmark run.
	- name: Setup Python
	  uses: actions/setup-python@v5
	  with:
	    python-version: "3.12"

	# Installs guidellm with its `recommended` extra and prints the version
	# that was resolved, so the log records which release was benchmarked.
	- name: Install guidellm
	  run: |
	    set -euo pipefail
	    python -m pip install --upgrade pip
	    # Mirrors the version pattern used by KAITO's vLLM benchmark
	    # entrypoint (https://github.com/kaito-project/kaito/blob/main/presets/workspace/inference/vllm/benchmark_entrypoint.py).
	    pip install 'guidellm[recommended]'
	    guidellm --version

	# Runs the guidellm benchmark against the published endpoint and writes
	# JSON + CSV results into ./benchmark-results (uploaded by a later step).
	- name: Run guidellm benchmark
	  env:
	    ENDPOINT: ${{ steps.endpoint.outputs.endpoint }}
	    API_KEY: ${{ steps.endpoint.outputs.api_key }}
	    MODEL: ${{ env.DEPLOYMENT_NAME }}
	  run: |
	    set -euo pipefail
	    mkdir -p ./benchmark-results
	    echo "── Running guidellm against ${ENDPOINT} (model=${MODEL}) ──"
	    echo " profile=${BENCHMARK_PROFILE} max-seconds=${BENCHMARK_MAX_SECONDS} data=${BENCHMARK_DATA}"
	    echo ""

	    # `--backend-kwargs` carries the OpenAI bearer token used by the
	    # llm-gateway-apikey AuthorizationPolicy guarding the
	    # `inference-gateway`. The console output below contains the
	    # TTFT, TPOT (per-output-token latency) and request/token
	    # throughput tables that satisfy the "TTFT / TPOT / TPM" report
	    # required by the workflow spec.
	    guidellm benchmark run \
	      --target "${ENDPOINT}" \
	      --model "${MODEL}" \
	      --backend-kwargs "{\"api_key\":\"${API_KEY}\",\"verify\":false}" \
	      --profile "${BENCHMARK_PROFILE}" \
	      --max-seconds "${BENCHMARK_MAX_SECONDS}" \
	      --data "${BENCHMARK_DATA}" \
	      --output-dir ./benchmark-results \
	      --outputs json \
	      --outputs csv \
	      --disable-console-interactive

	    echo ""
	    echo "── benchmark-results contents ──"
	    # Best-effort listing; never fail the step on it.
	    ls -al ./benchmark-results || true

	# For every inference pod: fetches /metrics through the API-server pod
	# proxy, saves the raw text under ./benchmark-results/vllm-metrics, prints
	# the key vllm:* series, and derives average TTFT / TPOT / e2e latency from
	# the histogram _sum / _count pairs. Runs even when earlier steps failed.
	- name: Scrape vLLM pod metrics
	  if: always()
	  run: |
	    set -euo pipefail
	    mkdir -p ./benchmark-results/vllm-metrics

	    echo "── Scraping vLLM /metrics from each inference pod (port ${MODEL_SERVER_PORT}) ──"
	    echo " Cross-checks the guidellm summary (TTFT / TPOT / throughput) against the"
	    echo " authoritative numbers reported by the vLLM (mock) model server itself."
	    echo ""

	    PODS="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
	      -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
	      -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"

	    # Nothing to scrape is reported but deliberately not treated as a
	    # failure of this best-effort step.
	    if [ -z "${PODS}" ]; then
	      echo "✗ no inference pods found for ${DEPLOYMENT_NAME}"
	      exit 0
	    fi

	    # Metrics of interest. vLLM exposes TTFT / TPOT / e2e latency as
	    # Prometheus histograms (the _sum / _count series let us compute
	    # accurate averages over the entire benchmark window).
	    # The trailing group anchors the end of the metric name: it is
	    # followed either by a space (no labels) or by `{` (label set).
	    # Inside single quotes a single backslash is what grep must see;
	    # `\\{` would require a literal backslash and never match.
	    KEY_METRICS='^(vllm:num_requests_running|vllm:num_requests_waiting|vllm:request_success_total|vllm:request_failure_total|vllm:prompt_tokens_total|vllm:generation_tokens_total|vllm:time_to_first_token_seconds_(sum|count)|vllm:time_per_output_token_seconds_(sum|count)|vllm:e2e_request_latency_seconds_(sum|count)|vllm:request_prompt_tokens_(sum|count)|vllm:request_generation_tokens_(sum|count)|vllm:kv_cache_usage_perc|vllm:gpu_cache_usage_perc)( |\{)'

	    for POD in ${PODS}; do
	      RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt"
	      echo "── ${POD} ──"
	      # A pod that cannot be scraped is logged and skipped.
	      if ! kubectl -n "${GATEWAY_NAMESPACE}" get \
	        --raw "/api/v1/namespaces/${GATEWAY_NAMESPACE}/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
	        > "${RAW}" 2>/tmp/scrape.err; then
	        echo " ✗ failed to scrape /metrics:"
	        sed 's/^/ /' /tmp/scrape.err || true
	        continue
	      fi

	      echo " raw metrics saved to ${RAW} ($(wc -l < "${RAW}") lines)"
	      echo ""
	      echo " key vllm:* series:"
	      grep -E "${KEY_METRICS}" "${RAW}" | sed 's/^/ /' || echo " (no matching series found)"
	      echo ""

	      # Derive averages from the histogram _sum / _count pairs so the
	      # workflow log carries a directly comparable TTFT / TPOT / e2e
	      # latency next to guidellm's own report. Implemented in awk to
	      # avoid a python heredoc (the YAML block-scalar indentation
	      # would prevent the closing tag from being recognised).
	      # Values are summed per metric name across all label sets.
	      awk -v pod="${POD}" '
	        /^[#]/ { next }
	        /^[[:space:]]*$/ { next }
	        {
	          # strip optional {label=...} block before the value
	          name = $1
	          sub(/\{.*\}/, "", name)
	          val = $NF + 0
	          tot[name] += val
	        }
	        END {
	          printf(" vLLM-derived summary for pod %s:\n", pod)
	          printf(" requests succeeded : %d\n", tot["vllm:request_success_total"])
	          printf(" requests failed : %d\n", tot["vllm:request_failure_total"])
	          printf(" prompt tokens total : %d\n", tot["vllm:prompt_tokens_total"])
	          printf(" generation tokens total : %d\n", tot["vllm:generation_tokens_total"])
	          ttft_n = tot["vllm:time_to_first_token_seconds_count"]
	          tpot_n = tot["vllm:time_per_output_token_seconds_count"]
	          e2e_n = tot["vllm:e2e_request_latency_seconds_count"]
	          if (ttft_n > 0) printf(" avg TTFT (time-to-first-tok) : %.2f ms\n", 1000 * tot["vllm:time_to_first_token_seconds_sum"] / ttft_n); else print " avg TTFT (time-to-first-tok) : n/a"
	          if (tpot_n > 0) printf(" avg TPOT (per-output-token) : %.2f ms\n", 1000 * tot["vllm:time_per_output_token_seconds_sum"] / tpot_n); else print " avg TPOT (per-output-token) : n/a"
	          if (e2e_n > 0) printf(" avg e2e (request latency) : %.2f ms\n", 1000 * tot["vllm:e2e_request_latency_seconds_sum"] / e2e_n); else print " avg e2e (request latency) : n/a"
	        }
	      ' "${RAW}" || true
	      echo ""
	    done

	# Always publish whatever landed in ./benchmark-results; an empty
	# directory only produces a warning.
	- name: Upload guidellm results
	  if: always()
	  uses: actions/upload-artifact@v4
	  with:
	    name: 'guidellm-benchmark-results-${{ github.run_id }}'
	    path: './benchmark-results'
	    if-no-files-found: warn

	# Best-effort cleanup: kills the background port-forward recorded in the
	# pid file and prints the tail of its log. Never fails the job (`set +e`).
	- name: Stop gateway port-forward
	  if: always()
	  run: |
	    set +e
	    if [ -f /tmp/benchmark-pf/pid ]; then
	      PF_PID="$(cat /tmp/benchmark-pf/pid)"
	      echo "stopping kubectl port-forward (pid=${PF_PID})"
	      kill "${PF_PID}" 2>/dev/null || true
	    fi
	    if [ -f /tmp/benchmark-pf/port-forward.log ]; then
	      echo "── port-forward log tail ──"
	      tail -n 50 /tmp/benchmark-pf/port-forward.log || true
	    fi

	# On failure, dump diagnostics BEFORE the release is removed, so the
	# resources that failed are still present in the cluster.
	# NOTE(review): assumes `make e2e-dump` inspects live cluster resources —
	# confirm against the Makefile.
	- name: Dump cluster state
	  if: failure()
	  run: make e2e-dump

	# Best-effort removal of the Helm release; errors are ignored because the
	# teardown step follows regardless.
	- name: Uninstall modeldeployment Helm release
	  if: always()
	  run: |
	    helm uninstall "${DEPLOYMENT_NAME}" \
	      --namespace "${GATEWAY_NAMESPACE}" \
	      --ignore-not-found --wait || true

	# Runs unconditionally; the resource group name is passed to make
	# explicitly via the step environment.
	- name: Teardown cluster
	  if: always()
	  env:
	    RESOURCE_GROUP: '${{ env.RESOURCE_GROUP }}'
	  run: make e2e-teardown

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: add benchmark workflow #2

Workflow file

feat: add benchmark workflow #2

Uh oh!

Workflow file for this run