Skip to content

Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift #219

Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift

Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift #219

Workflow file for this run

name: CI - Benchmark

# Human-readable run title: manual dispatches show "platform | model | branch";
# comment-triggered runs show the PR number and the triggering command.
run-name: >-
  ${{ github.event_name == 'workflow_dispatch'
  && format('Benchmark: {0} | {1} | {2}',
  inputs.platform,
  inputs.model_id || 'unsloth/Meta-Llama-3.1-8B',
  github.ref_name)
  || format('Benchmark: PR #{0} | {1}',
  github.event.issue.number,
  github.event.comment.body) }}

concurrency:
  # Non-benchmark PR comments get an isolated group keyed by run id so they can
  # never cancel a real benchmark; actual benchmark runs are serialized per PR
  # (falling back to run id for dispatches without an associated issue).
  group: >-
    ${{
    github.event_name == 'issue_comment' &&
    !contains(github.event.comment.body, '/benchmark kind') &&
    !contains(github.event.comment.body, '/benchmark openshift')
    && format('benchmark-isolated-{0}', github.run_id)
    || format('benchmark-{0}',
    github.event.issue.number
    || github.run_id)
    }}
  cancel-in-progress: true

on:
  # PR comments ("/benchmark kind" or "/benchmark openshift") trigger runs.
  issue_comment:
    types: [created]
  # Manual trigger with explicit platform/model selection.
  workflow_dispatch:
    inputs:
      platform:
        description: 'Platform: kind or openshift'
        required: true
        default: 'kind'
        type: choice
        options:
          - kind
          - openshift
      model_id:
        description: 'Model to benchmark (HuggingFace ID)'
        required: false
        default: 'unsloth/Meta-Llama-3.1-8B'
        type: string
jobs:
  gate:
    # Authorizes and routes benchmark requests. Emits a run/skip decision, the
    # target platform, and PR metadata (number / head SHA / head repo) that the
    # downstream jobs consume for checkout and result reporting.
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    outputs:
      run_benchmark: ${{ steps.check.outputs.run_benchmark }}
      platform: ${{ steps.check.outputs.platform }}
      pr_number: ${{ steps.check.outputs.pr_number }}
      pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
      pr_head_repo: ${{ steps.check.outputs.pr_head_repo }}
    steps:
      - name: Check if benchmark requested
        id: check
        uses: actions/github-script@v7
        with:
          script: |
            // True when `username` has write, maintain or admin permission on
            // this repo; any API failure is treated as "no access".
            async function hasWriteAccess(username) {
              try {
                const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  username: username
                });
                const privilegedRoles = ['admin', 'maintain', 'write'];
                return privilegedRoles.includes(permission.permission);
              } catch (e) {
                console.log(`Could not get permissions for ${username}: ${e.message}`);
                return false;
              }
            }
            if (context.eventName !== 'issue_comment' && context.eventName !== 'workflow_dispatch') {
              core.setOutput('run_benchmark', 'false');
              return;
            }
            if (context.eventName === 'workflow_dispatch') {
              // Manual dispatch: no permission check needed (dispatching
              // already requires repo access). Platform comes from the input.
              const platform = context.payload.inputs.platform;
              console.log(`Manual benchmark dispatch for ${platform}`);
              core.setOutput('run_benchmark', 'true');
              core.setOutput('platform', platform);
              // Try to find a PR for the current branch so we can post results
              const branch = context.ref.replace('refs/heads/', '');
              const { data: prs } = await github.rest.pulls.list({
                owner: context.repo.owner,
                repo: context.repo.repo,
                head: `${context.repo.owner}:${branch}`,
                state: 'open',
              });
              if (prs.length > 0) {
                core.setOutput('pr_number', prs[0].number.toString());
                core.setOutput('pr_head_sha', prs[0].head.sha);
                console.log(`Found open PR #${prs[0].number} for branch ${branch}`);
              } else {
                console.log(`No open PR found for branch ${branch}, skipping PR outputs`);
              }
              return;
            }
            // issue_comment path: only exact benchmark commands, on a PR, from
            // a user with write access, are honoured.
            const comment = context.payload.comment.body.trim();
            const issue = context.payload.issue;
            if (!issue.pull_request) {
              console.log('Comment is not on a PR, skipping');
              core.setOutput('run_benchmark', 'false');
              return;
            }
            const validCommands = ['/benchmark kind', '/benchmark openshift'];
            if (!validCommands.includes(comment)) {
              console.log(`Comment "${comment}" is not a valid benchmark command, skipping`);
              core.setOutput('run_benchmark', 'false');
              return;
            }
            const commenter = context.payload.comment.user.login;
            const hasAccess = await hasWriteAccess(commenter);
            if (!hasAccess) {
              console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
              core.setOutput('run_benchmark', 'false');
              return;
            }
            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: issue.number
            });
            // pr.head.repo can be null (deleted fork); fall back to base repo.
            const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
            const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
            // Fix: log the actual command — this previously hard-coded
            // "/benchmark kind" even for openshift requests.
            console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
            console.log(`PR head SHA: ${pr.head.sha}`);
            // Acknowledge the triggering comment with a rocket reaction.
            await github.rest.reactions.createForIssueComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: context.payload.comment.id,
              content: 'rocket'
            });
            const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
            const platform = comment.includes('openshift') ? 'OpenShift' : 'Kind';
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: issue.number,
              body: `🚀 **Benchmark (${platform})** triggered by \`${comment}\`\n\n[View the benchmark workflow run](${runUrl})`
            });
            core.setOutput('run_benchmark', 'true');
            core.setOutput('platform', platform.toLowerCase());
            core.setOutput('pr_number', issue.number.toString());
            core.setOutput('pr_head_sha', pr.head.sha);
            core.setOutput('pr_head_repo', headRepo);
build-image:
needs: gate
if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift')
runs-on: ubuntu-latest
outputs:
image_tag: ${{ steps.build.outputs.image_tag }}
steps:
- name: Checkout source
uses: actions/checkout@v4
with:
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Log in to GHCR
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ secrets.CR_USER }}
password: ${{ secrets.CR_TOKEN }}
- name: Build and push image
id: build
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
GIT_REF: ${{ needs.gate.outputs.pr_head_sha }}
run: |
IMAGE_TAG="bench-$(printf '%s' "$GIT_REF" | cut -c1-8)"
FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
echo "Building image: $FULL_IMAGE"
make docker-build IMG="$FULL_IMAGE"
make docker-push IMG="$FULL_IMAGE"
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
benchmark-kind:
runs-on: ubuntu-latest
needs: [gate]
if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'kind' || github.event.inputs.platform == 'kind')
timeout-minutes: 45
permissions:
contents: write
statuses: write
pull-requests: write
actions: read
steps:
- name: Set pending status on PR head
if: github.event_name == 'issue_comment'
uses: actions/github-script@v7
with:
script: |
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: '${{ needs.gate.outputs.pr_head_sha }}',
state: 'pending',
target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
description: 'Benchmark running...',
context: '${{ github.workflow }} / benchmark-kind'
});
- name: Validate PR head SHA
if: github.event_name == 'issue_comment'
run: |
if [ -z "${{ needs.gate.outputs.pr_head_sha }}" ]; then
echo "::error::pr_head_sha is empty — refusing to fall back to main"
exit 1
fi
echo "Checkout will use PR head SHA: ${{ needs.gate.outputs.pr_head_sha }}"
- name: Checkout source
uses: actions/checkout@v4
with:
repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}
ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
token: ${{ secrets.GITHUB_TOKEN }}
- name: Extract Go version from go.mod
run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> $GITHUB_ENV
- name: Set up Go with cache
uses: actions/setup-go@v6
with:
go-version: "${{ env.GO_VERSION }}"
cache-dependency-path: ./go.sum
- name: Install dependencies
run: go mod download
- name: Install Kind
run: |
ARCH=$(uname -m)
case "$ARCH" in
x86_64) KIND_ARCH="amd64" ;;
aarch64) KIND_ARCH="arm64" ;;
*) echo "Unsupported architecture: $ARCH"; exit 1 ;;
esac
curl -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.25.0/kind-linux-${KIND_ARCH}"
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind
kind version
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build WVA image locally
id: build-image
env:
CHECKOUT_SHA: ${{ needs.gate.outputs.pr_head_sha }}
run: |
IMAGE_NAME="llm-d-workload-variant-autoscaler"
IMAGE_TAG="bench-${CHECKOUT_SHA:0:7}"
FULL_IMAGE="localhost/${IMAGE_NAME}:${IMAGE_TAG}"
echo "Building local image: $FULL_IMAGE"
make docker-build IMG="$FULL_IMAGE"
echo "image=$FULL_IMAGE" >> $GITHUB_OUTPUT
- name: Deploy e2e infrastructure
env:
ENVIRONMENT: kind-emulator
USE_SIMULATOR: "true"
CREATE_CLUSTER: "true"
INSTALL_GATEWAY_CTRLPLANE: "true"
E2E_TESTS_ENABLED: "true"
IMG: ${{ steps.build-image.outputs.image }}
SKIP_BUILD: "true"
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
INSTALL_GRAFANA: "true"
run: make deploy-e2e-infra
- name: Run benchmark
env:
ENVIRONMENT: kind-emulator
USE_SIMULATOR: "true"
SCALER_BACKEND: prometheus-adapter
BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
BENCHMARK_GRAFANA_ENABLED: "true"
BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
run: make test-benchmark
- name: Upload benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: benchmark-results
path: |
/tmp/benchmark-results.json
/tmp/prefill-benchmark-results.json
/tmp/benchmark-grafana-snapshot.txt
/tmp/benchmark-grafana-snapshot.json
/tmp/benchmark-panels/
if-no-files-found: warn
- name: Post benchmark results as PR comment
if: always() && github.event_name == 'issue_comment'
uses: actions/github-script@v7
with:
script: |
const fs = require('fs');
const path = require('path');
const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}');
const sha = '${{ needs.gate.outputs.pr_head_sha }}';
const runId = context.runId;
const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;
// Look up the uploaded artifact to get a direct download link
let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
try {
const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
owner: context.repo.owner,
repo: context.repo.repo,
run_id: runId
});
const benchArtifact = artifacts.find(a => a.name === 'benchmark-results');
if (benchArtifact) {
artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
}
} catch (e) {
console.log(`Could not look up artifact: ${e.message}`);
}
let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';
try {
const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));
const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`;
resultsTable = `| Metric | Value |
|--------|-------|
| Scale-up time | ${fmtTime(data.scaleUpTimeSec)} |
| Scale-down time | ${fmtTime(data.scaleDownTimeSec)} |
| Max replicas | ${data.maxReplicas} |
| Avg KV cache usage | ${data.avgKVCacheUsage.toFixed(3)} |
| Avg queue depth | ${data.avgQueueDepth.toFixed(1)} |
| Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} |
| Total duration | ${data.totalDurationSec.toFixed(0)}s |`;
} catch (e) {
console.log(`Could not read results: ${e.message}`);
}
// Upload panel PNGs as release assets and collect URLs for embedding
let panelImages = '';
const panelDir = '/tmp/benchmark-panels';
const hasPanels = fs.existsSync(panelDir) &&
fs.readdirSync(panelDir).some(f => f.endsWith('.png'));
if (hasPanels) {
const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort();
const tag = `benchmark-run-${runId}`;
try {
// Create a lightweight release to host panel images
const release = await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: tag,
name: `Benchmark panels (PR #${prNumber}, ${sha.substring(0, 7)})`,
body: `Auto-generated by benchmark CI run #${runId}`,
draft: false,
prerelease: true
});
const imageUrls = [];
for (const png of pngs) {
const filePath = path.join(panelDir, png);
const fileData = fs.readFileSync(filePath);
const asset = await github.rest.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release.data.id,
name: png,
data: fileData,
headers: { 'content-type': 'image/png' }
});
const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
console.log(`Uploaded ${png}: ${asset.data.browser_download_url}`);
}
if (imageUrls.length > 0) {
panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
}
} catch (e) {
console.log(`Could not upload panel images: ${e.message}`);
}
}
// Check for Grafana snapshot
const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
let artifactsSection = '';
if (hasSnapshotJson || hasPanels) {
const items = [];
if (hasSnapshotJson) {
items.push('Grafana snapshot JSON');
}
artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})**${items.length ? ' — ' + items.join(', ') : ''}`;
}
const body = `## Benchmark: scale-up-latency (Kind)
${resultsTable}${panelImages}${artifactsSection}
<details>
<summary>Environment</summary>
- Cluster: Kind (emulated GPUs)
- Model: unsloth/Meta-Llama-3.1-8B (simulator)
- Commit: ${sha.substring(0, 7)}
- Scaler: prometheus-adapter
- [Workflow run](${repoUrl}/actions/runs/${runId})
</details>`;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: body
});
- name: Cleanup Kind cluster
if: always()
run: kind delete cluster --name kind-wva-gpu-cluster || true
benchmark-openshift:
runs-on: [self-hosted, openshift, vllm-d]
needs: [gate, build-image]
if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift')
timeout-minutes: 60
permissions:
contents: write
statuses: write
pull-requests: write
actions: read
env:
MODEL_ID: ${{ inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }}
ACCELERATOR_TYPE: 'H100'
GOTOOLCHAIN: auto
LLMD_NAMESPACE: llm-d-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
WVA_NAMESPACE: wva-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
WVA_RELEASE_NAME: wva-bench-${{ github.run_id }}
WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
steps:
- name: Set pending status on PR head
if: github.event_name == 'issue_comment'
uses: actions/github-script@v7
with:
script: |
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: '${{ needs.gate.outputs.pr_head_sha }}',
state: 'pending',
target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
description: 'Benchmark running on OpenShift...',
context: '${{ github.workflow }} / benchmark-openshift'
});
- name: Checkout source
uses: actions/checkout@v4
with:
repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}
ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Go
uses: actions/setup-go@v6
with:
go-version: "1.25.x"
cache-dependency-path: ./go.sum
- name: Install tools (kubectl, oc, helm, make)
run: |
sudo apt-get update && sudo apt-get install -y make
KUBECTL_VERSION="v1.31.0"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
chmod +x kubectl
sudo mv kubectl /usr/local/bin/
curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz"
tar -xzf openshift-client-linux.tar.gz
sudo mv oc /usr/local/bin/
rm -f openshift-client-linux.tar.gz kubectl README.md
curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- name: Verify cluster access
run: |
kubectl cluster-info
kubectl get nodes
- name: Get HF token from cluster secret
id: hf-token
run: |
HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d)
echo "::add-mask::$HF_TOKEN"
echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV
- name: Clean up resources for this PR
run: |
for ns in "$LLMD_NAMESPACE" "$WVA_NAMESPACE"; do
if kubectl get namespace "$ns" &>/dev/null; then
kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
for release in $(helm list -n "$ns" -q 2>/dev/null); do
helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true
done
kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true
fi
done
- name: Apply latest CRDs
run: kubectl apply -f charts/workload-variant-autoscaler/crds/
- name: Deploy WVA and llm-d infrastructure
env:
ENVIRONMENT: openshift
INSTALL_GATEWAY_CTRLPLANE: "false"
E2E_TESTS_ENABLED: "true"
NAMESPACE_SCOPED: "false"
LLMD_NS: ${{ env.LLMD_NAMESPACE }}
WVA_NS: ${{ env.WVA_NAMESPACE }}
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
DEPLOY_VA: "false"
DEPLOY_HPA: "false"
DECODE_REPLICAS: "1"
MONITORING_NAMESPACE: openshift-user-workload-monitoring
WVA_METRICS_SECURE: "false"
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
VLLM_SVC_PORT: "8000"
INSTALL_GRAFANA: "true"
run: |
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
- name: Label namespaces for OpenShift monitoring
run: |
kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
- name: Wait for infrastructure to be ready
run: |
kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true
kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || true
echo "--- Services in openshift-user-workload-monitoring ---"
kubectl get svc -n openshift-user-workload-monitoring
echo "--- Services in openshift-monitoring ---"
kubectl get svc -n openshift-monitoring
- name: Run benchmark
env:
ENVIRONMENT: openshift
USE_SIMULATOR: "false"
SCALER_BACKEND: prometheus-adapter
CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
E2E_MONITORING_NAMESPACE: openshift-user-workload-monitoring
E2E_EMULATED_LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
BENCHMARK_GRAFANA_ENABLED: "true"
BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
run: |
# Get token for Thanos querier
export PROMETHEUS_TOKEN=$(kubectl create token prometheus-k8s -n openshift-monitoring --duration=24h 2>/dev/null || echo "")
# Start APIService guard: KEDA on this cluster continuously reclaims the
# external.metrics.k8s.io APIService. This background loop re-patches it
# every 8 seconds so the HPA can read wva_desired_replicas during the benchmark.
# Key fix: caBundle must be set to null because KEDA sets it, and Kubernetes
# rejects insecureSkipTLSVerify=true when caBundle is present.
MONITORING_NS="openshift-user-workload-monitoring"
(
while true; do
sleep 8
current_svc=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.name}' 2>/dev/null)
current_ns=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.namespace}' 2>/dev/null)
if [ "$current_svc" != "prometheus-adapter" ] || [ "$current_ns" != "$MONITORING_NS" ]; then
echo "[apiservice-guard] KEDA reclaimed (now: $current_svc/$current_ns), re-patching..."
kubectl patch apiservice v1beta1.external.metrics.k8s.io --type=merge -p "{
\"spec\": {
\"caBundle\": null,
\"insecureSkipTLSVerify\": true,
\"service\": {
\"name\": \"prometheus-adapter\",
\"namespace\": \"$MONITORING_NS\"
}
}
}" 2>&1 || true
fi
done
) &
GUARD_PID=$!
echo "APIService guard started (PID=$GUARD_PID)"
# Give guard time to do initial patch if needed
sleep 12
echo "Checking external metrics API..."
kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1" | head -1 && echo "External metrics API: OK" || echo "WARNING: External metrics API not available"
TEST_EXIT=0
make test-benchmark || TEST_EXIT=$?
kill $GUARD_PID 2>/dev/null || true
exit $TEST_EXIT
- name: Generate benchmark plots
if: always()
run: |
echo "Installing matplotlib and numpy..."
if python3 -m venv /tmp/plot-venv 2>&1; then
/tmp/plot-venv/bin/pip install --quiet matplotlib numpy 2>&1
PYTHON=/tmp/plot-venv/bin/python3
else
echo "venv failed, using PIP_BREAK_SYSTEM_PACKAGES fallback..."
curl -sSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
PIP_BREAK_SYSTEM_PACKAGES=1 python3 /tmp/get-pip.py --user 2>&1
PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip install --user matplotlib numpy 2>&1
PYTHON=python3
fi
$PYTHON - <<'PLOTEOF'
import json, os, sys
try:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
except ImportError:
print("matplotlib not available, skipping plot generation")
sys.exit(0)
PANEL_DIR = '/tmp/benchmark-panels'
PREFILL_FILE = '/tmp/prefill-benchmark-results.json'
os.makedirs(PANEL_DIR, exist_ok=True)
if not os.path.exists(PREFILL_FILE):
print("No prefill results found, skipping plots")
sys.exit(0)
with open(PREFILL_FILE) as f:
results = json.load(f)
if not isinstance(results, list) or len(results) < 2:
print("Need at least 2 results (HPA + WVA) for comparison plots")
sys.exit(0)
hpa = next((r for r in results if r['autoscaler_type'] == 'HPA'), None)
wva = next((r for r in results if r['autoscaler_type'] == 'WVA'), None)
if not hpa or not wva:
print("Missing HPA or WVA results")
sys.exit(0)
plt.rcParams.update({
'figure.facecolor': 'white', 'axes.facecolor': '#f8f9fa',
'axes.grid': True, 'grid.alpha': 0.3, 'font.size': 12,
'axes.titlesize': 14, 'axes.labelsize': 12, 'figure.dpi': 150,
})
HPA_C, WVA_C = '#e74c3c', '#2ecc71'
EMPTY_METRIC = {'mean': 0, 'count': 0, 'percentiles': {k: 0 for k in ['p05','p10','p25','p50','p75','p90','p95','p99']}}
def m(data, key):
"""Safely get a metric dict, returning EMPTY_METRIC if absent."""
v = data.get(key)
if isinstance(v, dict):
if 'percentiles' not in v:
v['percentiles'] = EMPTY_METRIC['percentiles']
return v
return EMPTY_METRIC
def bar_pair(ax, hv, wv, title, ylabel, fmt='.1f'):
bars = ax.bar(['HPA', 'WVA'], [hv, wv], color=[HPA_C, WVA_C], width=0.5, edgecolor='white', linewidth=1.5)
ax.set_title(title, fontweight='bold')
ax.set_ylabel(ylabel)
for bar, val in zip(bars, [hv, wv]):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + ax.get_ylim()[1]*0.02,
f'{val:{fmt}}', ha='center', va='bottom', fontweight='bold', fontsize=11)
# 1. Comparison bars (3x3 grid)
fig, axes = plt.subplots(3, 3, figsize=(20, 13))
fig.suptitle('HPA vs WVA — Prefill-Heavy Workload (OpenShift)', fontsize=16, fontweight='bold', y=1.02)
bar_pair(axes[0,0], m(hpa,'throughput')['mean'], m(wva,'throughput')['mean'], 'Mean Throughput', 'tokens/sec', '.0f')
bar_pair(axes[0,1], m(hpa,'ttft')['count'], m(wva,'ttft')['count'], 'Completed Requests', 'count', 'd')
bar_pair(axes[0,2], hpa['max_replicas'], wva['max_replicas'], 'Max Replicas', 'replicas', 'd')
bar_pair(axes[1,0], hpa['avg_kv_cache'], wva['avg_kv_cache'], 'Avg KV Cache', 'utilization', '.3f')
bar_pair(axes[1,1], hpa['avg_queue_depth'], wva['avg_queue_depth'], 'Avg vLLM Queue', 'requests', '.0f')
bar_pair(axes[1,2], hpa.get('avg_epp_queue_depth',0), wva.get('avg_epp_queue_depth',0), 'Avg EPP Queue', 'requests', '.0f')
bar_pair(axes[2,0], m(hpa,'itl')['mean'], m(wva,'itl')['mean'], 'Mean ITL', 'ms', '.2f')
bar_pair(axes[2,1], m(hpa,'ttft')['mean']/1000, m(wva,'ttft')['mean']/1000, 'Mean TTFT', 'seconds', '.1f')
bar_pair(axes[2,2], hpa['avg_replicas'], wva['avg_replicas'], 'Avg Replicas', 'replicas', '.1f')
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-comparison.png'), bbox_inches='tight', dpi=150)
plt.close()
# 2. Replica timeline overlay
fig, ax = plt.subplots(figsize=(14, 5))
for data, label, color in [(hpa, 'HPA', HPA_C), (wva, 'WVA', WVA_C)]:
tl = data['replica_timeline']
times = [s['elapsed_sec'] for s in tl]
ready = [s['ready_replicas'] for s in tl]
ax.step(times, ready, where='post', label=f'{label} (ready)', color=color, linewidth=2.5)
ax.fill_between(times, ready, step='post', alpha=0.1, color=color)
ax.set_title('Ready Replicas Over Time — HPA vs WVA', fontsize=14, fontweight='bold')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Ready Replicas')
ax.legend(fontsize=12)
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
ax.set_ylim(0, max(wva['max_replicas'], hpa['max_replicas']) + 1)
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-replica-timeline.png'), bbox_inches='tight', dpi=150)
plt.close()
# 3. Queue depth (vLLM + EPP) + KV cache over time
fig, (ax_qd, ax_epp, ax_kv) = plt.subplots(3, 1, figsize=(14, 11), sharex=True)
fig.suptitle('Queue Depth & KV Cache Over Time — HPA vs WVA', fontsize=15, fontweight='bold')
for data, label, color in [(hpa, 'HPA', HPA_C), (wva, 'WVA', WVA_C)]:
mt = data.get('metrics_timeline', [])
if mt:
times = [s['elapsed_sec'] for s in mt]
qd = [s['queue_depth'] for s in mt]
epp_qd = [s.get('epp_queue_depth', 0) for s in mt]
kv = [s['kv_cache'] for s in mt]
ax_qd.plot(times, qd, label=label, color=color, linewidth=2, alpha=0.85)
ax_qd.fill_between(times, qd, alpha=0.1, color=color)
ax_epp.plot(times, epp_qd, label=label, color=color, linewidth=2, alpha=0.85)
ax_epp.fill_between(times, epp_qd, alpha=0.1, color=color)
ax_kv.plot(times, kv, label=label, color=color, linewidth=2, alpha=0.85)
ax_kv.fill_between(times, kv, alpha=0.1, color=color)
ax_qd.set_title('vLLM Queue Depth (vllm:num_requests_waiting)', fontweight='bold')
ax_qd.set_ylabel('Waiting Requests')
ax_qd.legend(fontsize=11)
ax_epp.set_title('EPP Queue Depth (inference_extension_flow_control_queue_size)', fontweight='bold')
ax_epp.set_ylabel('Queued Requests')
ax_epp.legend(fontsize=11)
ax_kv.set_title('KV Cache Utilization (vllm:kv_cache_usage_perc)', fontweight='bold')
ax_kv.set_ylabel('Utilization')
ax_kv.set_xlabel('Time (seconds)')
ax_kv.legend(fontsize=11)
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-metrics-timeline.png'), bbox_inches='tight', dpi=150)
plt.close()
# 4. Throughput percentile distribution
pct_keys = ['p05', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99']
pct_labels = ['p5', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99']
x = np.arange(len(pct_labels))
w = 0.35
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Percentile Distributions — HPA vs WVA', fontsize=15, fontweight='bold')
for ax, metric, unit, div in [(ax1,'ttft','seconds',1000),(ax2,'itl','ms',1),(ax3,'throughput','tok/s',1)]:
hv = [m(hpa, metric)['percentiles'].get(k, 0)/div for k in pct_keys]
wv = [m(wva, metric)['percentiles'].get(k, 0)/div for k in pct_keys]
ax.bar(x - w/2, hv, w, label='HPA', color=HPA_C, alpha=0.85)
ax.bar(x + w/2, wv, w, label='WVA', color=WVA_C, alpha=0.85)
ax.set_xticks(x); ax.set_xticklabels(pct_labels, fontsize=9)
ax.set_title(f'{metric.upper()} Percentiles', fontweight='bold')
ax.set_ylabel(unit); ax.legend(fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-percentiles.png'), bbox_inches='tight', dpi=150)
plt.close()
print(f"Generated benchmark plots in {PANEL_DIR}")
for f in sorted(os.listdir(PANEL_DIR)):
if f.endswith('.png'):
print(f" {f}")
# --- Generate per-autoscaler PDF reports (colleague format) ---
# Builds one multi-page PDF per autoscaler run (HPA baseline and WVA):
#   Page 1: configuration & results summary as a monospace text panel
#   Page 2: time-series charts (KV cache, queue depth, replicas, EPP queue)
#   Page 3: GuideLLM percentile bar charts + compact summary
from matplotlib.backends.backend_pdf import PdfPages
import textwrap  # NOTE(review): imported but never used below — candidate for removal
model_id = os.environ.get('MODEL_ID', 'unknown')
model_short = model_id.split('/')[-1]  # strip the HF org prefix, e.g. "org/model" -> "model"
for data in [hpa, wva]:
    atype = data['autoscaler_type']
    pdf_path = os.path.join(PANEL_DIR, f'report_{atype.lower()}_{model_short}.pdf')
    with PdfPages(pdf_path) as pdf:
        # Page 1: Configuration & Summary
        fig, ax = plt.subplots(figsize=(11, 8.5))
        ax.axis('off')  # the page is pure text, no plot
        lines = []
        sep = '='*80
        dash = '-'*80
        # Config fields are optional in the results JSON; fall back to 'N/A'.
        va_cfg = data.get('va_config', 'N/A')
        hpa_cfg = data.get('hpa_config', 'N/A')
        data_model = data.get('model_id', model_id)
        if atype == 'WVA':
            atype_label = 'Workload Variant Autoscaler (WVA)'
        else:
            atype_label = 'HPA Baseline (VA-constrained + HPA)'
        lines.append(sep)
        lines.append(f"AUTOSCALER TYPE : {atype_label}")
        lines.append(f"MODEL : {data_model}")
        lines.append(sep)
        lines.append(f"Autoscaler Configuration")
        lines.append(dash)
        lines.append(f" Variant (VA) : {va_cfg}")
        lines.append(f" HPA : {hpa_cfg}")
        lines.append(sep)
        # NOTE(review): the load-generator and EPP settings below are hard-coded
        # prose; keep in sync with the actual benchmark configuration.
        lines.append(f"Benchmark Load Generator Configuration")
        lines.append(dash)
        lines.append(f" Profile : poisson @ 20 req/s")
        lines.append(f" Prompt tokens : 4000 | Output tokens: 1000")
        lines.append(f" Max seconds : 600 | Seed: 42")
        lines.append(sep)
        lines.append(f"EPP Configuration")
        lines.append(dash)
        lines.append(f" Flow Control : ENABLED")
        lines.append(f" Scorers : queue-scorer=2, kv-cache-utilization-scorer=2, prefix-cache-scorer=3")
        lines.append(sep)
        lines.append(f"Results Summary")
        lines.append(dash)
        # Metric objects may be missing or non-dict on failed runs; guard each access.
        tp_obj = data.get('throughput', {})
        ttft_obj = data.get('ttft', {})
        itl_obj = data.get('itl', {})
        tp_mean = tp_obj.get('mean', 0) if isinstance(tp_obj, dict) else 0
        ttft_mean = ttft_obj.get('mean', 0) if isinstance(ttft_obj, dict) else 0
        ttft_p50 = ttft_obj.get('percentiles', {}).get('p50', 0) if isinstance(ttft_obj, dict) else 0
        ttft_p99 = ttft_obj.get('percentiles', {}).get('p99', 0) if isinstance(ttft_obj, dict) else 0
        itl_mean = itl_obj.get('mean', 0) if isinstance(itl_obj, dict) else 0
        itl_p50 = itl_obj.get('percentiles', {}).get('p50', 0) if isinstance(itl_obj, dict) else 0
        itl_p99 = itl_obj.get('percentiles', {}).get('p99', 0) if isinstance(itl_obj, dict) else 0
        # Completed-request count is taken from the TTFT sample count.
        completed = ttft_obj.get('count', 0) if isinstance(ttft_obj, dict) else 0
        error_count = data.get('error_count', 0)
        incomplete_count = data.get('incomplete_count', 0)
        achieved_rps = data.get('achieved_rps', 0)
        lines.append(f" Completed Requests : {completed}")
        lines.append(f" Failed Requests : {error_count}")
        lines.append(f" Incomplete Requests : {incomplete_count}")
        lines.append(f" Achieved RPS : {achieved_rps:.2f}")
        lines.append(f" Throughput (mean) : {tp_mean:.1f} tok/s")
        lines.append(f" Max Replicas : {data['max_replicas']}")
        lines.append(f" Avg Replicas : {data['avg_replicas']:.2f}")
        lines.append(f" Avg vLLM Queue : {data['avg_queue_depth']:.1f}")
        lines.append(f" Avg EPP Queue : {data.get('avg_epp_queue_depth', 0):.1f}")
        lines.append(f" Avg KV Cache : {data['avg_kv_cache']*100:.2f}%")
        lines.append(dash)
        # TTFT values are stored in ms; render as seconds.
        lines.append(f" TTFT mean={ttft_mean/1000:.2f}s p50={ttft_p50/1000:.2f}s p99={ttft_p99/1000:.2f}s")
        lines.append(f" ITL mean={itl_mean:.2f}ms p50={itl_p50:.2f}ms p99={itl_p99:.2f}ms")
        lines.append(f" Duration: {data['duration_sec']:.0f}s")
        lines.append(sep)
        ax.text(0.05, 0.95, '\n'.join(lines), transform=ax.transAxes, fontsize=8.5,
                verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8))
        fig.suptitle(f'{atype} Benchmark Report — {model_short}', fontsize=14, fontweight='bold')
        pdf.savefig(fig, bbox_inches='tight')
        plt.close()
        # Page 2: Time-series charts (KV Cache, Queue, Replicas, EPP Queue)
        mt = data.get('metrics_timeline', [])
        tl = data.get('replica_timeline', [])
        if mt and tl:  # skip the page entirely when either timeline is missing
            fig, axes = plt.subplots(4, 1, figsize=(11, 14), sharex=True)
            fig.suptitle(f'{atype} — Metrics Over Time ({model_short})', fontsize=14, fontweight='bold')
            # Per-autoscaler accent color, consistent with the comparison plots above.
            color = WVA_C if atype == 'WVA' else HPA_C
            times_m = [s['elapsed_sec'] for s in mt]
            kv = [s['kv_cache']*100 for s in mt]  # fraction -> percent
            qd = [s['queue_depth'] for s in mt]
            epp = [s.get('epp_queue_depth', 0) for s in mt]  # older snapshots may lack this key
            times_r = [s['elapsed_sec'] for s in tl]
            ready = [s['ready_replicas'] for s in tl]
            axes[0].plot(times_m, kv, color=color, linewidth=2)
            axes[0].fill_between(times_m, kv, alpha=0.15, color=color)
            axes[0].set_ylabel('KV Cache Usage (%)')
            axes[0].set_title('KV Cache Usage Over Time')
            axes[1].plot(times_m, qd, color=color, linewidth=2)
            axes[1].fill_between(times_m, qd, alpha=0.15, color=color)
            axes[1].set_ylabel('Requests Waiting')
            axes[1].set_title('Number of Requests Waiting Over Time')
            # Step plot: replica count changes discretely; 'post' holds the value
            # until the next snapshot.
            axes[2].step(times_r, ready, where='post', color=color, linewidth=2.5, label='Actual Replicas')
            axes[2].fill_between(times_r, ready, step='post', alpha=0.1, color=color)
            # Overlay EPP queue on a secondary y-axis of the replica chart.
            ax2b = axes[2].twinx()
            ax2b.plot(times_m, epp, color='#3498db', linewidth=1.5, alpha=0.7, label='EPP Queue')
            ax2b.set_ylabel('EPP Queue Size', color='#3498db')
            axes[2].set_ylabel('Replica Count')
            axes[2].set_title('Replica Count & EPP Queue Over Time')
            axes[2].yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
            axes[2].legend(loc='upper left', fontsize=9)
            ax2b.legend(loc='upper right', fontsize=9)
            axes[3].plot(times_m, epp, color='#3498db', linewidth=2)
            axes[3].fill_between(times_m, epp, alpha=0.15, color='#3498db')
            axes[3].set_ylabel('EPP Queue Size')
            axes[3].set_xlabel('Time (seconds)')
            axes[3].set_title('EPP Flow Control Queue Size Over Time')
            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()
        # Page 3: GuideLLM metrics (TTFT, ITL, Throughput distributions)
        has_pcts = isinstance(ttft_obj, dict) and 'percentiles' in ttft_obj
        if has_pcts:
            fig, axes = plt.subplots(2, 2, figsize=(11, 8.5))
            fig.suptitle(f'{atype} — GuideLLM Latency & Throughput ({model_short})', fontsize=14, fontweight='bold')
            pct_keys = ['p05','p10','p25','p50','p75','p90','p95','p99']
            pct_labels = ['p5','p10','p25','p50','p75','p90','p95','p99']
            x = np.arange(len(pct_labels))
            # TTFT: ms -> seconds for display; mean drawn as a reference line.
            ttft_vals = [ttft_obj.get('percentiles', {}).get(k, 0)/1000 for k in pct_keys]
            axes[0,0].bar(x, ttft_vals, color=color, alpha=0.85)
            axes[0,0].set_xticks(x); axes[0,0].set_xticklabels(pct_labels, fontsize=8)
            axes[0,0].set_title('TTFT Percentiles', fontweight='bold')
            axes[0,0].set_ylabel('seconds')
            axes[0,0].axhline(y=ttft_mean/1000, color='red', linestyle='--', label=f'mean={ttft_mean/1000:.1f}s')
            axes[0,0].legend(fontsize=8)
            itl_vals = [itl_obj.get('percentiles', {}).get(k, 0) for k in pct_keys]
            axes[0,1].bar(x, itl_vals, color=color, alpha=0.85)
            axes[0,1].set_xticks(x); axes[0,1].set_xticklabels(pct_labels, fontsize=8)
            axes[0,1].set_title('ITL Percentiles', fontweight='bold')
            axes[0,1].set_ylabel('ms')
            axes[0,1].axhline(y=itl_mean, color='red', linestyle='--', label=f'mean={itl_mean:.2f}ms')
            axes[0,1].legend(fontsize=8)
            tp_vals = [tp_obj.get('percentiles', {}).get(k, 0) for k in pct_keys]
            axes[1,0].bar(x, tp_vals, color=color, alpha=0.85)
            axes[1,0].set_xticks(x); axes[1,0].set_xticklabels(pct_labels, fontsize=8)
            axes[1,0].set_title('Throughput Percentiles', fontweight='bold')
            axes[1,0].set_ylabel('tok/s')
            axes[1,0].axhline(y=tp_mean, color='red', linestyle='--', label=f'mean={tp_mean:.0f}')
            axes[1,0].legend(fontsize=8)
            # Fourth quadrant: text summary instead of a chart.
            axes[1,1].axis('off')
            summary_lines = [
                f"Completed : {completed}",
                f"Failed : {error_count}",
                f"Incomplete: {incomplete_count}",
                f"RPS : {achieved_rps:.2f}",
                f"",
                f"Throughput: {tp_mean:.0f} tok/s",
                f"TTFT mean : {ttft_mean/1000:.2f}s",
                f"ITL mean : {itl_mean:.2f}ms",
                f"",
                f"Avg Replicas: {data['avg_replicas']:.2f}",
                f"Max Replicas: {data['max_replicas']}",
                f"Avg KV Cache: {data['avg_kv_cache']*100:.2f}%",
            ]
            axes[1,1].text(0.1, 0.85, '\n'.join(summary_lines), transform=axes[1,1].transAxes,
                           fontsize=11, verticalalignment='top', fontfamily='monospace',
                           bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8))
            axes[1,1].set_title('Summary', fontweight='bold')
            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()
    print(f" Generated PDF report: {pdf_path}")
print(f"Generated benchmark plots in {PANEL_DIR}")
for f in sorted(os.listdir(PANEL_DIR)):
    if f.endswith('.png') or f.endswith('.pdf'):
        print(f"  {f}")
PLOTEOF
# Upload raw results, Grafana snapshots, and generated panels even when the
# benchmark failed, so a broken run can still be debugged from artifacts.
- name: Upload benchmark results
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: benchmark-results-openshift
    path: |
      /tmp/benchmark-results.json
      /tmp/prefill-benchmark-results.json
      /tmp/benchmark-grafana-snapshot.txt
      /tmp/benchmark-grafana-snapshot.json
      /tmp/benchmark-panels/
    # Some of these files are optional depending on which benchmark phases ran;
    # warn (don't fail) when any are missing.
    if-no-files-found: warn
# Publish a results summary comment on the PR: scale-up metrics table,
# optional HPA-vs-WVA comparison, dashboard panel images (via a prerelease),
# and a link to the uploaded artifacts. Runs even on failure so partial
# results are still reported.
- name: Post benchmark results as PR comment
  if: always() && (github.event_name == 'issue_comment' || needs.gate.outputs.pr_number != '')
  uses: actions/github-script@v7
  with:
    script: |
      const fs = require('fs');
      const path = require('path');
      // Explicit radix; the gate output may be empty on workflow_dispatch runs.
      const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}', 10);
      const sha = '${{ needs.gate.outputs.pr_head_sha }}';
      const runId = context.runId;
      const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;
      // Without a PR number, createComment would fail with a NaN issue number —
      // bail out early instead.
      if (!Number.isInteger(prNumber)) {
        console.log('No PR number available; skipping results comment.');
        return;
      }
      // Prefer a direct artifact download URL; fall back to the run page.
      let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
      try {
        const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
          owner: context.repo.owner,
          repo: context.repo.repo,
          run_id: runId
        });
        const benchArtifact = artifacts.find(a => a.name === 'benchmark-results-openshift');
        if (benchArtifact) {
          artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
        }
      } catch (e) {
        console.log(`Could not look up artifact: ${e.message}`);
      }
      // Scale-up-latency results table (best effort: the file is absent when
      // that benchmark phase did not run or crashed).
      let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';
      try {
        const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));
        // Negative durations are sentinel values for "event never observed".
        const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`;
        resultsTable = `| Metric | Value |
      |--------|-------|
      | Scale-up time | ${fmtTime(data.scaleUpTimeSec)} |
      | Scale-down time | ${fmtTime(data.scaleDownTimeSec)} |
      | Max replicas | ${data.maxReplicas} |
      | Avg KV cache usage | ${data.avgKVCacheUsage.toFixed(3)} |
      | Avg queue depth | ${data.avgQueueDepth.toFixed(1)} |
      | Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} |
      | Total duration | ${data.totalDurationSec.toFixed(0)}s |`;
      } catch (e) {
        console.log(`Could not read results: ${e.message}`);
      }
      // Prefill-heavy comparison section (HPA baseline vs WVA), if available.
      let prefillSection = '';
      try {
        const prefillData = JSON.parse(fs.readFileSync('/tmp/prefill-benchmark-results.json', 'utf8'));
        if (Array.isArray(prefillData) && prefillData.length >= 2) {
          const hpa = prefillData.find(r => r.autoscaler_type === 'HPA');
          const wva = prefillData.find(r => r.autoscaler_type === 'WVA');
          if (hpa && wva) {
            // Relative change of WVA vs the HPA baseline; '—' when the baseline
            // is zero (undefined ratio).
            const delta = (h, w) => {
              if (h === 0) return '—';
              const pct = ((w - h) / Math.abs(h)) * 100;
              const arrow = pct < 0 ? '↓' : '↑';
              const sign = pct > 0 ? '+' : '';
              return `${sign}${pct.toFixed(1)}% ${arrow}`;
            };
            // Missing percentile keys fall back to 0 instead of rendering 'NaN'.
            const fmtP = (obj, key, div=1) => obj && obj.percentiles ? ((obj.percentiles[key] || 0)/div).toFixed(1) : 'N/A';
            const fmtM = (obj, div=1, prec=1) => obj ? (obj.mean/div).toFixed(prec) : 'N/A';
            let table = `| Metric | HPA (Baseline) | WVA | Δ |
      |--------|---------------|-----|---|
      | **Max Replicas** | ${hpa.max_replicas} | **${wva.max_replicas}** | ${delta(hpa.max_replicas, wva.max_replicas)} |
      | **Avg Replicas** | ${hpa.avg_replicas.toFixed(2)} | **${wva.avg_replicas.toFixed(2)}** | ${delta(hpa.avg_replicas, wva.avg_replicas)} |
      | **Avg vLLM Queue Depth** | ${hpa.avg_queue_depth.toFixed(1)} | **${wva.avg_queue_depth.toFixed(1)}** | ${delta(hpa.avg_queue_depth, wva.avg_queue_depth)} |
      | **Avg EPP Queue Depth** | ${(hpa.avg_epp_queue_depth||0).toFixed(1)} | **${(wva.avg_epp_queue_depth||0).toFixed(1)}** | ${delta(hpa.avg_epp_queue_depth||0, wva.avg_epp_queue_depth||0)} |
      | **Avg KV Cache** | ${hpa.avg_kv_cache.toFixed(3)} | ${wva.avg_kv_cache.toFixed(3)} | ${delta(hpa.avg_kv_cache, wva.avg_kv_cache)} |
      | **TTFT mean** | ${fmtM(hpa.ttft, 1000)}s | **${fmtM(wva.ttft, 1000)}s** | ${hpa.ttft && wva.ttft ? delta(hpa.ttft.mean, wva.ttft.mean) : '—'} |
      | **TTFT p50** | ${fmtP(hpa.ttft, 'p50', 1000)}s | **${fmtP(wva.ttft, 'p50', 1000)}s** | — |
      | **TTFT p99** | ${fmtP(hpa.ttft, 'p99', 1000)}s | **${fmtP(wva.ttft, 'p99', 1000)}s** | — |
      | **ITL mean** | ${fmtM(hpa.itl, 1, 2)}ms | **${fmtM(wva.itl, 1, 2)}ms** | ${hpa.itl && wva.itl ? delta(hpa.itl.mean, wva.itl.mean) : '—'} |
      | **Throughput mean** | ${fmtM(hpa.throughput)}tok/s | **${fmtM(wva.throughput)}tok/s** | ${hpa.throughput && wva.throughput ? delta(hpa.throughput.mean, wva.throughput.mean) : '—'} |
      | **Throughput p50** | ${fmtP(hpa.throughput, 'p50')}tok/s | **${fmtP(wva.throughput, 'p50')}tok/s** | — |
      | **Completed Requests** | ${hpa.ttft ? hpa.ttft.count : 'N/A'} | **${wva.ttft ? wva.ttft.count : 'N/A'}** | ${hpa.ttft && wva.ttft ? delta(hpa.ttft.count, wva.ttft.count) : '—'} |
      | **Failed Requests** | ${hpa.error_count || 0} | ${wva.error_count || 0} | — |
      | **Incomplete Requests** | ${hpa.incomplete_count || 0} | ${wva.incomplete_count || 0} | — |
      | **Achieved RPS** | ${(hpa.achieved_rps || 0).toFixed(2)} | ${(wva.achieved_rps || 0).toFixed(2)} | — |
      | **Duration** | ${hpa.duration_sec.toFixed(0)}s | ${wva.duration_sec.toFixed(0)}s | — |`;
            // Collapsible replica timelines, one per autoscaler.
            let timelines = '';
            for (const r of [hpa, wva]) {
              if (r.replica_timeline && r.replica_timeline.length > 0) {
                timelines += `\n<details>\n<summary>${r.autoscaler_type} Replica Timeline (${r.replica_timeline.length} snapshots)</summary>\n\n| Time (s) | Spec | Ready |\n|----------|------|-------|\n`;
                for (const s of r.replica_timeline) {
                  timelines += `| ${s.elapsed_sec.toFixed(0)} | ${s.spec_replicas} | ${s.ready_replicas} |\n`;
                }
                timelines += `\n</details>\n`;
              }
            }
            prefillSection = `\n\n---\n\n## Benchmark: prefill-heavy-workload (OpenShift)\n\n${table}\n${timelines}`;
          }
        } else if (Array.isArray(prefillData) && prefillData.length > 0) {
          // Only one autoscaler ran: emit per-run tables without the Δ column.
          let rows = '';
          for (const r of prefillData) {
            rows += `\n### ${r.autoscaler_type}\n\n| Metric | Value |\n|--------|-------|\n| Duration | ${r.duration_sec.toFixed(0)}s |\n| Max Replicas | ${r.max_replicas} |\n| Avg Replicas | ${r.avg_replicas.toFixed(2)} |\n| Avg vLLM Queue Depth | ${r.avg_queue_depth.toFixed(2)} |\n| Avg EPP Queue Depth | ${(r.avg_epp_queue_depth||0).toFixed(2)} |\n| Avg KV Cache | ${r.avg_kv_cache.toFixed(3)} |\n`;
          }
          prefillSection = `\n\n---\n\n## Benchmark: prefill-heavy-workload (OpenShift)\n${rows}`;
        }
      } catch (e) {
        console.log(`Could not read prefill results: ${e.message}`);
      }
      // Panel PNGs cannot be embedded in a comment directly; upload them as
      // prerelease assets and link the asset URLs.
      let panelImages = '';
      const panelDir = '/tmp/benchmark-panels';
      const hasPanels = fs.existsSync(panelDir) && fs.readdirSync(panelDir).some(f => f.endsWith('.png'));
      if (hasPanels) {
        const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort();
        const tag = `benchmark-run-os-${runId}`;
        try {
          const release = await github.rest.repos.createRelease({
            owner: context.repo.owner,
            repo: context.repo.repo,
            tag_name: tag,
            name: `Benchmark panels OpenShift (PR #${prNumber}, ${sha.substring(0, 7)})`,
            body: `Auto-generated by benchmark CI run #${runId}`,
            draft: false,
            prerelease: true
          });
          const imageUrls = [];
          for (const png of pngs) {
            const filePath = path.join(panelDir, png);
            const fileData = fs.readFileSync(filePath);
            const asset = await github.rest.repos.uploadReleaseAsset({
              owner: context.repo.owner,
              repo: context.repo.repo,
              release_id: release.data.id,
              name: png,
              data: fileData,
              headers: { 'content-type': 'image/png' }
            });
            // Derive a human-readable title from the file name.
            const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
            imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
          }
          if (imageUrls.length > 0) {
            panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
          }
        } catch (e) {
          console.log(`Could not upload panel images: ${e.message}`);
        }
      }
      const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
      let artifactsSection = '';
      if (hasSnapshotJson || hasPanels) {
        const items = [];
        if (hasSnapshotJson) items.push('Grafana snapshot JSON');
        if (hasPanels) items.push('dashboard panel images');
        artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})**${items.length ? ' — ' + items.join(', ') : ''}`;
      }
      // Blank lines around the heading, <details>, and list are required for
      // GitHub-flavored markdown to render the table and bullet list correctly.
      const body = `## Benchmark: scale-up-latency (OpenShift)

      ${resultsTable}${prefillSection}${panelImages}${artifactsSection}

      <details>
      <summary>Environment</summary>

      - Cluster: OpenShift (Real GPUs)
      - Model: ${process.env.MODEL_ID || 'unsloth/Meta-Llama-3.1-8B'}
      - Accelerator: H100
      - Commit: ${sha.substring(0, 7)}
      - Scaler: prometheus-adapter
      - [Workflow run](${repoUrl}/actions/runs/${runId})

      </details>`;
      await github.rest.issues.createComment({
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: prNumber,
        body: body
      });
# Best-effort teardown: every command tolerates absence or failure (|| true)
# so cleanup can never fail the job, even after a partial deployment.
- name: Cleanup infrastructure
  if: always()
  run: |
    # Remove the WVA release first, then every Helm release in the llm-d namespace.
    helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
      helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    done
    # Finally delete both namespaces (sweeps up any remaining resources).
    kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
    kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
# Final job: set a commit status on the PR head SHA reflecting the outcome of
# whichever benchmark job (kind or openshift) the gate selected. Runs whenever
# the gate approved a benchmark, regardless of how the benchmark itself ended.
report-status:
  runs-on: ubuntu-latest
  needs: [gate, benchmark-kind, benchmark-openshift]
  if: always() && needs.gate.outputs.run_benchmark == 'true'
  permissions:
    statuses: write
  steps:
    - name: Report status to PR
      uses: actions/github-script@v7
      with:
        script: |
          const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
          const platform = '${{ needs.gate.outputs.platform }}';
          // Pick the result of the job that actually ran for this platform.
          let benchResult;
          if (platform === 'openshift') {
            benchResult = '${{ needs.benchmark-openshift.result }}';
          } else {
            benchResult = '${{ needs.benchmark-kind.result }}';
          }
          // workflow_dispatch runs have no associated PR commit to report on.
          if (!prHeadSha) {
            console.log('No PR head SHA available, skipping status report');
            return;
          }
          // Anything other than explicit success (skipped, cancelled, failure)
          // is reported as a failed check so the PR is not green by accident.
          let state, description;
          if (benchResult === 'success') {
            state = 'success';
            description = 'Benchmark completed successfully';
          } else if (benchResult === 'skipped') {
            state = 'failure';
            description = 'Benchmark did not run (prerequisite failed or skipped)';
          } else if (benchResult === 'cancelled') {
            state = 'failure';
            description = 'Benchmark cancelled';
          } else {
            state = 'failure';
            description = 'Benchmark failed';
          }
          // The status context must name the platform that actually ran;
          // previously it was hard-coded to "benchmark-kind", so openshift runs
          // reported under the wrong check name.
          const statusContext = `${{ github.workflow }} / benchmark-${platform === 'openshift' ? 'openshift' : 'kind'}`;
          console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: prHeadSha,
            state: state,
            target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
            description: description,
            context: statusContext
          });
          console.log('Status reported successfully');