feat: add modelharness helm chart #11

Workflow file for this run

.github/workflows/benchmark.yaml at 56faa5e

	name: ModelDeployment benchmark

	# Workflow that:
	# 1. Reuses the E2E base setup composite action to bring up the AKS
	# cluster + all stack components (Istio gateway, BBR, EPP, KEDA,
	# llm-gateway-apikey, gpu-node-mocker, KAITO).
	# 2. Installs the modeldeployment Helm chart in the `default` namespace,
	# reusing the cluster-wide `inference-gateway` Gateway and the
	# cluster-wide `default` APIKey (Secret `llm-api-key`).
	# 3. Starts a `kubectl port-forward` against the gateway service so the
	# complete request path Gateway → BBR → EPP → vLLM (mock) pod can be
	# exercised from localhost. The endpoint URL and API key are echoed
	# to the workflow log.
	# 4. Installs guidellm and runs a benchmark against the endpoint (profile
	# selected by BENCHMARK_PROFILE, currently `concurrent`), printing TTFT /
	# TPOT / TPM (and the rest of the guidellm console summary) into the
	# workflow log.
	#
	# All component versions and benchmark parameters are hard-coded in the
	# `env:` block below — there are no workflow_dispatch inputs. To change a
	# value, edit this file (and rely on the pull_request trigger to validate
	# the change end-to-end).

	on:
	  # Run on PRs that touch the benchmark workflow or anything it exercises
	  # (modeldeployment chart, gpu-node-mocker, e2e composite action) so the
	  # full path Gateway → BBR → EPP → vLLM (mock) pod can be debugged from
	  # a PR.
	  pull_request:
	    branches: [main]
	    paths:
	      - '.github/workflows/benchmark.yaml'
	      - '.github/actions/e2e-base-setup/**'
	      - 'charts/modeldeployment/**'
	      - 'cmd/gpu-node-mocker/**'
	      - 'pkg/gpu-node-mocker/**'
	      - 'docker/Dockerfile'
	      - 'versions.env'
	  # Manual runs take no inputs; every parameter lives in the `env:` block.
	  workflow_dispatch:

	env:
	  # ----- Cluster / registry identity (unique per workflow run) -----
	  RESOURCE_GROUP: "kaito-gw-bench-rg-${{ github.run_id }}"
	  CLUSTER_NAME: "kaito-gw-bench-aks-${{ github.run_id }}"
	  ACR_NAME: "kaitogwbenchaks${{ github.run_id }}acr"
	  GPU_MOCKER_IMAGE: "gpu-node-mocker:latest-${{ github.run_id }}"
	  LOCATION: swedencentral
	  NODE_COUNT: '3'
	  NODE_VM_SIZE: Standard_D8s_v5
	  # ----- Gateway access -----
	  GATEWAY_NAME: inference-gateway
	  GATEWAY_NAMESPACE: default
	  GATEWAY_LOCAL_PORT: '18080'
	  # The llm-gateway-apikey ext_authz filter resolves the APIKey CR's
	  # namespace from the request's Host header subdomain
	  # (`<namespace>.gw.kaito.sh`). Sending requests to `localhost:<port>`
	  # makes the filter reject them with 401 + body
	  # "cannot determine gateway namespace: set context_extensions[gateway-namespace]
	  # or use subdomain-based host". We map this FQDN to 127.0.0.1 in
	  # /etc/hosts so the kubectl port-forward target carries the correct Host.
	  GATEWAY_HOST: default.gw.kaito.sh
	  APIKEY_SECRET_NAME: llm-api-key
	  APIKEY_SECRET_KEY: apiKey
	  # ----- Benchmark parameters (previously workflow_dispatch inputs) -----
	  # KAITO inference preset to benchmark (must exist in KAITO main HEAD
	  # model_catalog.yaml).
	  MODEL_PRESET: 'phi-4-mini-instruct'
	  # Helm release / InferenceSet name. Also the value of the `model` field
	  # in OpenAI requests (matched by the HTTPRoute as X-Gateway-Model-Name).
	  DEPLOYMENT_NAME: 'benchmark-phi'
	  # VM instance type passed to the InferenceSet (must be a SKU the
	  # gpu-node-mocker labels as a GPU node).
	  INSTANCE_TYPE: 'Standard_NV36ads_A10_v5'
	  # guidellm benchmark profile (sweep / throughput / concurrent /
	  # constant / poisson / synchronous).
	  #
	  # `concurrent` runs a single benchmark point with a fixed number of
	  # in-flight streams (set via BENCHMARK_RATE), so the total runtime is
	  # bounded by BENCHMARK_MAX_SECONDS instead of being multiplied by the
	  # ~10 sub-benchmarks `sweep` would produce. With concurrent streams >1
	  # guidellm still records the full TTFT / TPOT / e2e latency histograms,
	  # so the report contains the same headline numbers as a sweep.
	  BENCHMARK_PROFILE: 'concurrent'
	  # For `concurrent` / `constant` profiles this is the number of
	  # concurrent streams (or requests/sec for `constant`). Ignored by
	  # `sweep` / `throughput` / `synchronous`.
	  #
	  # Capped at 3 to stay under the maximum concurrency (5) the vLLM (mock)
	  # pod is provisioned for — going above this stops producing meaningful
	  # TTFT / TPOT numbers because requests start queueing on the server side.
	  BENCHMARK_RATE: '3'
	  # Maximum seconds per guidellm sub-benchmark. With BENCHMARK_PROFILE=
	  # `concurrent` there is exactly one sub-benchmark, so this also bounds
	  # the total guidellm runtime. Keep this small enough that the whole
	  # "Run benchmark test for inference workload" step (pip install +
	  # benchmark + result serialization) finishes in under 2 minutes.
	  BENCHMARK_MAX_SECONDS: '60'
	  # guidellm --data spec (synthetic data config or HF dataset).
	  BENCHMARK_DATA: 'prompt_tokens=512,output_tokens=128'
	  # HuggingFace model id used by guidellm as the tokenizer/processor when
	  # generating synthetic prompts from BENCHMARK_DATA. Must be a real HF
	  # repo id — the OpenAI `model` field sent on the wire (DEPLOYMENT_NAME,
	  # e.g. `benchmark-phi`) is a gateway-routing name and is not a valid
	  # HF identifier, so guidellm cannot fall back to it. Should match the
	  # tokenizer of MODEL_PRESET above (phi-4-mini-instruct →
	  # microsoft/Phi-4-mini-instruct).
	  BENCHMARK_PROCESSOR: 'microsoft/Phi-4-mini-instruct'
	  # The benchmark always runs against a single InferenceSet replica so the
	  # numbers from guidellm and the per-pod vLLM metrics describe the same
	  # backend.
	  REPLICAS: '1'
	  # Port the vLLM (mock) model server listens on inside each inference pod.
	  # Matches `epp.modelServerPort` in charts/modeldeployment/values.yaml
	  # (KAITO PortInferenceServer = 5000) and is the same port that exposes
	  # the Prometheus `/metrics` endpoint with the `vllm:*` series.
	  MODEL_SERVER_PORT: '5000'

	# Default token scope for every job: read-only access to repository contents.
	permissions:
	  contents: read

	jobs:
	  # Single job: bring up the stack, deploy the model, benchmark it through
	  # the gateway, report metrics, and always clean up afterwards.
	  benchmark:
	    runs-on: ["self-hosted", "hostname:kaito-e2e-github-runner"]
	    environment: e2e-test
	    permissions:
	      contents: read

	    steps:
	      - name: Checkout Repository
	        uses: actions/checkout@v6
	        with:
	          ref: ${{ github.ref }}

	      - name: Setup Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.12'

	      - name: Setup AKS and Kaito Stack
	        uses: ./.github/actions/e2e-base-setup
	        with:
	          resource-group: ${{ env.RESOURCE_GROUP }}
	          cluster-name: ${{ env.CLUSTER_NAME }}
	          acr-name: ${{ env.ACR_NAME }}
	          gpu-mocker-image: ${{ env.GPU_MOCKER_IMAGE }}
	          location: ${{ env.LOCATION }}
	          node-count: ${{ env.NODE_COUNT }}
	          node-vm-size: ${{ env.NODE_VM_SIZE }}
	          # All component versions fall through to the defaults baked into
	          # versions.env (consumed by the composite action).
	          istio-version: ''
	          gateway-api-version: ''
	          bbr-version: ''
	          keda-version: ''
	          keda-kaito-scaler-version: ''
	          llm-gateway-auth-version: ''

	      # Installs (or upgrades) the modeldeployment chart and blocks until Helm
	      # reports the release ready or the 5 minute timeout expires.
	      - name: Deploy inference workload via ModelDeployment
	        run: |
	          set -euo pipefail
	          echo "── Installing modeldeployment chart ──"
	          echo " release / name : ${DEPLOYMENT_NAME}"
	          echo " namespace : ${GATEWAY_NAMESPACE}"
	          echo " preset model : ${MODEL_PRESET}"
	          echo " replicas : ${REPLICAS}"
	          echo " instance type : ${INSTANCE_TYPE}"
	          echo " gateway : ${GATEWAY_NAME} (namespace ${GATEWAY_NAMESPACE})"

	          helm upgrade --install "${DEPLOYMENT_NAME}" charts/modeldeployment \
	            --namespace "${GATEWAY_NAMESPACE}" \
	            --create-namespace \
	            --set name="${DEPLOYMENT_NAME}" \
	            --set namespace="${GATEWAY_NAMESPACE}" \
	            --set model="${MODEL_PRESET}" \
	            --set replicas="${REPLICAS}" \
	            --set instanceType="${INSTANCE_TYPE}" \
	            --set gatewayName="${GATEWAY_NAME}" \
	            --wait --timeout=5m

	      # Waits for the workload and the API key Secret, starts a background
	      # port-forward to the gateway, publishes `endpoint` / `api_key` as step
	      # outputs and smoke-tests the full request path once.
	      - name: Wait for inference workload endpoint and API key
	        id: endpoint
	        run: |
	          set -euo pipefail

	          # ---------- 1. Wait for inference pods to become Ready ----------
	          echo "── Waiting for InferenceSet ${DEPLOYMENT_NAME} pods to be Ready (timeout 10m) ──"

	          # Wait for the EPP deployment first (ext_proc gRPC endpoint that the Gateway reaches).
	          kubectl -n "${GATEWAY_NAMESPACE}" rollout status \
	            "deployment/${DEPLOYMENT_NAME}-inferencepool-epp" --timeout=5m

	          # Wait for the inference (shadow) pods. KAITO labels each
	          # inference pod with `inferenceset.kaito.sh/created-by=<name>`.
	          deadline=$(( $(date +%s) + 600 ))
	          ready=0
	          while [ "$(date +%s)" -lt "${deadline}" ]; do
	            # `grep -c` prints 0 and exits non-zero when nothing matches;
	            # `|| true` keeps `set -e` / pipefail from aborting the poll.
	            ready=$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
	              -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
	              -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
	              2>/dev/null | grep -c '^True$' || true)
	            echo " ready pods: ${ready}/${REPLICAS}"
	            if [ "${ready}" -ge "${REPLICAS}" ]; then
	              echo "✓ all ${REPLICAS} inference pods are Ready"
	              break
	            fi
	            sleep 10
	          done
	          if [ "${ready:-0}" -lt "${REPLICAS}" ]; then
	            echo "✗ inference pods did not all become Ready within 10m"
	            kubectl -n "${GATEWAY_NAMESPACE}" get pods -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" -o wide
	            exit 1
	          fi

	          # ---------- 2. Wait for the default APIKey Secret ----------
	          echo "── Waiting for Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} (created by the apikey-operator from the cluster-wide APIKey CR) ──"
	          secret_found=0
	          for _ in $(seq 1 60); do
	            if kubectl -n "${GATEWAY_NAMESPACE}" get secret "${APIKEY_SECRET_NAME}" >/dev/null 2>&1; then
	              echo "✓ Secret found"
	              secret_found=1
	              break
	            fi
	            sleep 5
	          done
	          if [ "${secret_found}" -ne 1 ]; then
	            echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} did not appear within 5m"
	            exit 1
	          fi

	          # ---------- 3. Start gateway port-forward ----------
	          SERVICE="${GATEWAY_NAME}-istio"

	          # Map the subdomain-based gateway host to the loopback address
	          # so curl / guidellm send `Host: ${GATEWAY_HOST}` (which the
	          # llm-gateway-apikey filter parses to derive the APIKey CR's
	          # namespace) while still hitting the local port-forward.
	          if ! grep -qE "[[:space:]]${GATEWAY_HOST}([[:space:]]|$)" /etc/hosts; then
	            echo "127.0.0.1 ${GATEWAY_HOST}" | sudo tee -a /etc/hosts >/dev/null
	            echo "added /etc/hosts entry: 127.0.0.1 ${GATEWAY_HOST}"
	          fi

	          # Use the job-scoped RUNNER_TEMP directory (always writable by the
	          # runner user) instead of a shared /tmp path. On self-hosted
	          # runners /tmp may contain leftovers from previous jobs owned by
	          # a different user, which causes "Permission denied" on writes.
	          PF_DIR="${RUNNER_TEMP}/benchmark-pf"

	          # Start kubectl port-forward in the background. Output goes to a
	          # log file so the workflow can dump it on failure. The pid file is
	          # read back by the Cleanup step to stop the process.
	          mkdir -p "${PF_DIR}"
	          nohup kubectl -n "${GATEWAY_NAMESPACE}" port-forward \
	            "svc/${SERVICE}" "${GATEWAY_LOCAL_PORT}:80" \
	            > "${PF_DIR}/port-forward.log" 2>&1 &
	          PF_PID=$!
	          echo "${PF_PID}" > "${PF_DIR}/pid"
	          echo "started kubectl port-forward (pid=${PF_PID})"

	          # Wait for the local port to accept TCP connections.
	          for _ in $(seq 1 30); do
	            if (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then
	              echo "✓ port-forward ready on localhost:${GATEWAY_LOCAL_PORT}"
	              break
	            fi
	            sleep 1
	          done
	          if ! (echo > "/dev/tcp/127.0.0.1/${GATEWAY_LOCAL_PORT}") 2>/dev/null; then
	            echo "✗ port-forward did not become ready"
	            cat "${PF_DIR}/port-forward.log" || true
	            exit 1
	          fi

	          API_KEY="$(kubectl -n "${GATEWAY_NAMESPACE}" get secret \
	            "${APIKEY_SECRET_NAME}" \
	            -o jsonpath="{.data.${APIKEY_SECRET_KEY}}" | base64 -d)"
	          if [ -z "${API_KEY}" ]; then
	            echo "✗ Secret ${GATEWAY_NAMESPACE}/${APIKEY_SECRET_NAME} has no value under key ${APIKEY_SECRET_KEY}"
	            exit 1
	          fi

	          # Register the mask BEFORE the key is written anywhere: add-mask
	          # only redacts output produced after it, so echoing the key first
	          # would leave it in the log in clear text.
	          echo "::add-mask::${API_KEY}"

	          # Use the subdomain-based FQDN (mapped to 127.0.0.1 above) so the
	          # request Host header lets the llm-gateway-apikey filter resolve
	          # the APIKey CR namespace.
	          ENDPOINT="http://${GATEWAY_HOST}:${GATEWAY_LOCAL_PORT}"

	          echo ""
	          echo "════════════════════════════════════════════════════════════════════"
	          echo "ModelDeployment endpoint ready"
	          echo " endpoint : ${ENDPOINT}"
	          echo " model : ${DEPLOYMENT_NAME}"
	          echo " api key : ${API_KEY}"
	          echo ""
	          echo " Example smoke test:"
	          echo " curl -sS -H \"Authorization: Bearer ${API_KEY}\" \\"
	          echo " -H 'Content-Type: application/json' \\"
	          echo " -d '{\"model\":\"${DEPLOYMENT_NAME}\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}' \\"
	          echo " ${ENDPOINT}/v1/chat/completions"
	          echo "════════════════════════════════════════════════════════════════════"
	          echo ""

	          # Publish to later steps (the literal value is masked above).
	          {
	            echo "endpoint=${ENDPOINT}"
	            echo "api_key=${API_KEY}"
	          } >> "${GITHUB_OUTPUT}"

	          # ---------- 4. Smoke-test the endpoint ----------
	          echo "── Sending a single OpenAI-format request to verify the path Gateway → BBR → EPP → pod ──"
	          curl -sS --fail-with-body \
	            -H "Authorization: Bearer ${API_KEY}" \
	            -H 'Content-Type: application/json' \
	            -d "{\"model\":\"${DEPLOYMENT_NAME}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":16}" \
	            "${ENDPOINT}/v1/chat/completions" | tee "${PF_DIR}/smoke.json"
	          echo ""

	      # Installs guidellm and runs one benchmark against the endpoint published
	      # by the previous step; results land in ./benchmark-results.
	      - name: Run benchmark test for inference workload
	        env:
	          ENDPOINT: ${{ steps.endpoint.outputs.endpoint }}
	          API_KEY: ${{ steps.endpoint.outputs.api_key }}
	          MODEL: ${{ env.DEPLOYMENT_NAME }}
	        run: |
	          set -euo pipefail

	          # ---------- 1. Install guidellm ----------
	          python -m pip install --upgrade pip
	          # Mirrors the version pattern used by KAITO's vLLM benchmark
	          # entrypoint (https://github.com/kaito-project/kaito/blob/main/presets/workspace/inference/vllm/benchmark_entrypoint.py).
	          pip install 'guidellm[recommended]'
	          guidellm --version

	          # ---------- 2. Run guidellm benchmark ----------
	          mkdir -p ./benchmark-results
	          echo "── Running guidellm against ${ENDPOINT} (model=${MODEL}) ──"
	          echo " profile=${BENCHMARK_PROFILE} rate=${BENCHMARK_RATE} max-seconds=${BENCHMARK_MAX_SECONDS} data=${BENCHMARK_DATA}"
	          echo " processor=${BENCHMARK_PROCESSOR}"
	          echo ""

	          # `--backend-kwargs` carries the OpenAI bearer token used by the
	          # llm-gateway-apikey AuthorizationPolicy guarding the
	          # `inference-gateway`. The console output below contains the
	          # TTFT, TPOT (per-output-token latency) and request/token
	          # throughput tables that satisfy the "TTFT / TPOT / TPM" report
	          # required by the workflow spec.
	          #
	          # `validate_backend: false` disables guidellm's startup probe
	          # (a `GET /health` against the target). The modeldeployment
	          # HTTPRoute only matches requests carrying the
	          # `X-Gateway-Model-Name` header and routes by model name via
	          # the EPP, so `/health` is not exposed at the gateway and the
	          # probe returns 404. The previous "Wait for inference workload
	          # endpoint and API key" step already smoke-tested the full path
	          # with a real `/v1/chat/completions` request, so skipping
	          # guidellm's redundant validation is safe.
	          guidellm benchmark run \
	            --target "${ENDPOINT}" \
	            --model "${MODEL}" \
	            --processor "${BENCHMARK_PROCESSOR}" \
	            --backend-kwargs "{\"api_key\":\"${API_KEY}\",\"verify\":false,\"validate_backend\":false}" \
	            --profile "${BENCHMARK_PROFILE}" \
	            --rate "${BENCHMARK_RATE}" \
	            --max-seconds "${BENCHMARK_MAX_SECONDS}" \
	            --data "${BENCHMARK_DATA}" \
	            --output-dir ./benchmark-results \
	            --outputs json \
	            --outputs csv \
	            --disable-console-interactive

	          echo ""
	          echo "── benchmark-results contents ──"
	          ls -al ./benchmark-results || true

	      # Best-effort reporting step (runs even after a failure): prints the
	      # guidellm result files, scrapes vLLM /metrics from each shadow pod and
	      # writes a Markdown report to the run summary.
	      - name: Collect metrics for inference workload
	        if: always()
	        run: |
	          set -euo pipefail

	          # Record whether guidellm left a results directory BEFORE creating
	          # the vllm-metrics subdirectory; `mkdir -p` below would otherwise
	          # make the "does not exist" branch further down unreachable.
	          RESULTS_PRESENT=0
	          if [ -d ./benchmark-results ]; then
	            RESULTS_PRESENT=1
	          fi
	          mkdir -p ./benchmark-results/vllm-metrics

	          # The metrics produced by this step are emitted twice:
	          # 1. To stdout — visible in the GitHub Actions job log.
	          # 2. To $GITHUB_STEP_SUMMARY — rendered as Markdown on the
	          #    workflow run summary page (the "Summary" tab) so the
	          #    headline numbers are visible without scrolling through
	          #    the raw log.
	          SUMMARY="${GITHUB_STEP_SUMMARY:-/dev/null}"

	          {
	            echo "# Benchmark results — \`${DEPLOYMENT_NAME}\`"
	            echo ""
	            echo "| field | value |"
	            echo "| --- | --- |"
	            echo "| model preset | \`${MODEL_PRESET}\` |"
	            echo "| deployment | \`${DEPLOYMENT_NAME}\` |"
	            echo "| namespace | \`${GATEWAY_NAMESPACE}\` |"
	            echo "| replicas | ${REPLICAS} |"
	            echo "| instance type | \`${INSTANCE_TYPE}\` |"
	            echo "| guidellm profile | \`${BENCHMARK_PROFILE}\` |"
	            echo "| guidellm max-seconds | ${BENCHMARK_MAX_SECONDS} |"
	            echo "| guidellm data | \`${BENCHMARK_DATA}\` |"
	            echo ""
	          } >> "${SUMMARY}"

	          # ---------- 1. Print guidellm benchmark result files ----------
	          echo "── guidellm benchmark-results files ──"
	          if [ "${RESULTS_PRESENT}" -eq 1 ]; then
	            # `-printf` is GNU-only; fall back to `ls -l` where it is missing.
	            find ./benchmark-results -maxdepth 2 -type f -printf ' %p (%s bytes)\n' 2>/dev/null \
	              || find ./benchmark-results -maxdepth 2 -type f -exec ls -l {} \;
	            # The escaped parentheses group the -o alternatives so that
	            # `-type f` applies to every pattern; reading line by line keeps
	            # paths containing spaces intact.
	            find ./benchmark-results -maxdepth 2 -type f \
	              \( -name '*.json' -o -name '*.csv' -o -name '*.txt' -o -name '*.md' \) \
	              | sort \
	              | while IFS= read -r f; do
	                  echo ""
	                  echo "─── ${f} ───"
	                  cat "${f}" || true
	                  echo ""
	                done
	          else
	            echo " (./benchmark-results does not exist — guidellm probably did not run)"
	          fi
	          echo ""

	          # Append a compact guidellm summary (one row per sub-benchmark)
	          # to the Step Summary. The CSV emitted by `guidellm --outputs csv`
	          # already has a row per profile point with TTFT / TPOT /
	          # TPM-equivalent columns, so we surface the full file verbatim —
	          # it is small and self-explanatory.
	          GUIDELLM_CSV="$(find ./benchmark-results -maxdepth 2 -type f -name '*.csv' | sort | head -n1 || true)"
	          {
	            echo "## guidellm summary"
	            echo ""
	            if [ -n "${GUIDELLM_CSV}" ] && [ -s "${GUIDELLM_CSV}" ]; then
	              echo "_Source: \`${GUIDELLM_CSV}\` (TTFT / TPOT / per-second token & request throughput per profile point)._"
	              echo ""
	              echo '```csv'
	              cat "${GUIDELLM_CSV}"
	              echo '```'
	            else
	              echo "_No guidellm CSV produced — the benchmark step probably did not run._"
	            fi
	            echo ""
	          } >> "${SUMMARY}"

	          # ---------- 2. Scrape vLLM pod metrics ----------
	          echo "── Scraping vLLM /metrics from each inference pod (port ${MODEL_SERVER_PORT}) ──"
	          echo " Cross-checks the guidellm summary (TTFT / TPOT / throughput) against the"
	          echo " authoritative numbers reported by the vLLM (mock) model server itself."
	          echo ""

	          # The pods labelled `inferenceset.kaito.sh/created-by=<set>` are
	          # the original (placeholder) KAITO inference pods. They are
	          # bound to fake GPU nodes managed by gpu-node-mocker and never
	          # actually run a kubelet, so the apiserver pod-proxy
	          # (/api/v1/.../pods/<name>:<port>/proxy/...) cannot reach them.
	          # The vLLM (mock) model server actually runs inside the matching
	          # shadow pod (`shadow-<ns>-<name>`) created by gpu-node-mocker
	          # on a real node and labelled `kaito.sh/managed-by=gpu-mocker` /
	          # `kaito.sh/shadow-pod-for=<ns>.<original-name>`. Resolve each
	          # original pod to its shadow before scraping /metrics.
	          #
	          # Best-effort: this step also runs after earlier failures, when the
	          # cluster may be unreachable, so a failed lookup yields an empty
	          # list (reported below) instead of aborting the report.
	          ORIGINAL_PODS="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
	            -l "inferenceset.kaito.sh/created-by=${DEPLOYMENT_NAME}" \
	            -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"

	          PODS=""
	          for ORIG in ${ORIGINAL_PODS}; do
	            SHADOW_SELECTOR="kaito.sh/shadow-pod-for=${GATEWAY_NAMESPACE}.${ORIG}"
	            SHADOW_POD="$(kubectl -n "${GATEWAY_NAMESPACE}" get pods \
	              -l "${SHADOW_SELECTOR}" \
	              -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
	            if [ -z "${SHADOW_POD}" ]; then
	              echo "⚠️ no shadow pod found for original pod ${ORIG} (selector: ${SHADOW_SELECTOR}) — skipping"
	              continue
	            fi
	            echo " resolved ${ORIG} → ${SHADOW_POD}"
	            PODS="${PODS}${SHADOW_POD} "
	          done

	          {
	            echo "## vLLM per-pod metrics"
	            echo ""
	          } >> "${SUMMARY}"

	          if [ -z "${PODS//[[:space:]]/}" ]; then
	            echo "✗ no shadow pods found for ${DEPLOYMENT_NAME}"
	            echo "_No shadow pods were found for \`${DEPLOYMENT_NAME}\`._" >> "${SUMMARY}"
	            exit 0
	          fi

	          {
	            echo "| pod | requests succeeded | requests failed | prompt tokens | generation tokens | avg TTFT (ms) | avg TPOT (ms) | avg e2e (ms) |"
	            echo "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |"
	          } >> "${SUMMARY}"

	          # Metrics of interest. vLLM exposes TTFT / TPOT / e2e latency as
	          # Prometheus histograms (the _sum / _count series let us compute
	          # accurate averages over the entire benchmark window).
	          # The trailing `[ {]` anchors the end of the metric name: it is
	          # followed either by a space (no labels) or by `{` (label block).
	          KEY_METRICS='^(vllm:num_requests_running|vllm:num_requests_waiting|vllm:request_success_total|vllm:request_failure_total|vllm:prompt_tokens_total|vllm:generation_tokens_total|vllm:time_to_first_token_seconds_(sum|count)|vllm:time_per_output_token_seconds_(sum|count)|vllm:e2e_request_latency_seconds_(sum|count)|vllm:request_prompt_tokens_(sum|count)|vllm:request_generation_tokens_(sum|count)|vllm:kv_cache_usage_perc|vllm:gpu_cache_usage_perc)[ {]'

	          # Use the job-scoped RUNNER_TEMP directory for the per-pod
	          # kubectl stderr capture file. On self-hosted runners /tmp may
	          # contain leftovers from previous jobs owned by a different
	          # user, which causes "Permission denied" when bash tries to
	          # open `2>/tmp/scrape.err` — and because the redirection
	          # target fails to open, the `kubectl get --raw` command never
	          # executes, so every pod is reported as "scrape failed".
	          SCRAPE_ERR="${RUNNER_TEMP}/scrape.err"

	          for POD in ${PODS}; do
	            RAW="./benchmark-results/vllm-metrics/${POD}.metrics.txt"
	            echo "── ${POD} ──"
	            if ! kubectl -n "${GATEWAY_NAMESPACE}" get \
	              --raw "/api/v1/namespaces/${GATEWAY_NAMESPACE}/pods/${POD}:${MODEL_SERVER_PORT}/proxy/metrics" \
	              > "${RAW}" 2>"${SCRAPE_ERR}"; then
	              echo " ✗ failed to scrape /metrics:"
	              sed 's/^/ /' "${SCRAPE_ERR}" || true
	              echo "| \`${POD}\` | _scrape failed_ | | | | | | |" >> "${SUMMARY}"
	              continue
	            fi

	            echo " raw metrics saved to ${RAW} ($(wc -l < "${RAW}") lines)"
	            echo ""
	            echo " key vllm:* series:"
	            grep -E "${KEY_METRICS}" "${RAW}" | sed 's/^/ /' || echo " (no matching series found)"
	            echo ""

	            # Derive averages from the histogram _sum / _count pairs.
	            # Print a human-readable block to stdout AND a single Markdown
	            # table row to $GITHUB_STEP_SUMMARY. Implemented in awk to
	            # avoid a python heredoc (the YAML block-scalar indentation
	            # would prevent the closing tag from being recognised).
	            awk -v pod="${POD}" -v summary="${SUMMARY}" '
	              /^[#]/ { next }
	              /^[[:space:]]*$/ { next }
	              {
	                # strip optional {label=...} block before the value, then
	                # sum the last field across all label sets of a series
	                name = $1
	                sub(/\{.*\}/, "", name)
	                val = $NF + 0
	                tot[name] += val
	              }
	              END {
	                ok = tot["vllm:request_success_total"]
	                fail = tot["vllm:request_failure_total"]
	                ptoks = tot["vllm:prompt_tokens_total"]
	                gtoks = tot["vllm:generation_tokens_total"]
	                ttft_n = tot["vllm:time_to_first_token_seconds_count"]
	                tpot_n = tot["vllm:time_per_output_token_seconds_count"]
	                e2e_n = tot["vllm:e2e_request_latency_seconds_count"]
	                # -1 marks "no samples" and is rendered as n/a below.
	                ttft_ms = (ttft_n > 0) ? 1000 * tot["vllm:time_to_first_token_seconds_sum"] / ttft_n : -1
	                tpot_ms = (tpot_n > 0) ? 1000 * tot["vllm:time_per_output_token_seconds_sum"] / tpot_n : -1
	                e2e_ms = (e2e_n > 0) ? 1000 * tot["vllm:e2e_request_latency_seconds_sum"] / e2e_n : -1

	                # Stdout (workflow log) — human-readable block.
	                printf(" vLLM-derived summary for pod %s:\n", pod)
	                printf(" requests succeeded : %d\n", ok)
	                printf(" requests failed : %d\n", fail)
	                printf(" prompt tokens total : %d\n", ptoks)
	                printf(" generation tokens total : %d\n", gtoks)
	                if (ttft_ms >= 0) printf(" avg TTFT (time-to-first-tok) : %.2f ms\n", ttft_ms); else print " avg TTFT (time-to-first-tok) : n/a"
	                if (tpot_ms >= 0) printf(" avg TPOT (per-output-token) : %.2f ms\n", tpot_ms); else print " avg TPOT (per-output-token) : n/a"
	                if (e2e_ms >= 0) printf(" avg e2e (request latency) : %.2f ms\n", e2e_ms); else print " avg e2e (request latency) : n/a"

	                # GITHUB_STEP_SUMMARY — single Markdown table row.
	                ttft_s = (ttft_ms >= 0) ? sprintf("%.2f", ttft_ms) : "n/a"
	                tpot_s = (tpot_ms >= 0) ? sprintf("%.2f", tpot_ms) : "n/a"
	                e2e_s = (e2e_ms >= 0) ? sprintf("%.2f", e2e_ms ) : "n/a"
	                printf("| `%s` | %d | %d | %d | %d | %s | %s | %s |\n",
	                  pod, ok, fail, ptoks, gtoks, ttft_s, tpot_s, e2e_s) >> summary
	              }
	            ' "${RAW}" || true
	            echo ""
	          done

	          {
	            echo ""
	            echo "_Raw \`/metrics\` dumps for every pod are saved under \`./benchmark-results/vllm-metrics/\` on the runner._"
	          } >> "${SUMMARY}"

	      # Always runs. Every command is best-effort (`set +e` plus `|| true`) so
	      # one failing cleanup action never prevents the cluster teardown.
	      - name: Cleanup (stop port-forward, uninstall release, dump state on failure, teardown cluster)
	        if: always()
	        env:
	          RESOURCE_GROUP: ${{ env.RESOURCE_GROUP }}
	          JOB_STATUS: ${{ job.status }}
	        run: |
	          set +e

	          # ---------- 1. Stop gateway port-forward ----------
	          # The pid file is written by the "Wait for inference workload
	          # endpoint and API key" step; it is absent if that step never
	          # reached the port-forward.
	          PF_DIR="${RUNNER_TEMP}/benchmark-pf"
	          if [ -f "${PF_DIR}/pid" ]; then
	            PF_PID="$(cat "${PF_DIR}/pid")"
	            echo "stopping kubectl port-forward (pid=${PF_PID})"
	            kill "${PF_PID}" 2>/dev/null || true
	          fi
	          if [ -f "${PF_DIR}/port-forward.log" ]; then
	            echo "── port-forward log tail ──"
	            tail -n 50 "${PF_DIR}/port-forward.log" || true
	          fi

	          # ---------- 2. Uninstall modeldeployment Helm release ----------
	          helm uninstall "${DEPLOYMENT_NAME}" \
	            --namespace "${GATEWAY_NAMESPACE}" \
	            --ignore-not-found --wait || true

	          # ---------- 3. Dump cluster state on failure ----------
	          if [ "${JOB_STATUS}" = "failure" ]; then
	            echo "── Job failed — dumping cluster state ──"
	            make e2e-dump || true
	          fi

	          # ---------- 4. Teardown cluster ----------
	          make e2e-teardown || true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: add modelharness helm chart #11

Workflow file

feat: add modelharness helm chart #11

Uh oh!

Workflow file for this run