Benchmark: PR #1010 | ## Investigation Summary: `/benchmark openshift` Gateway 500 Failure Problem: The Gateway connectivity check always fails with HTTP 500 from `istio-envoy` (empty body). Root cause: The `llm-d-infra` chart (v1.4.0) creates the Gateway with `istio.io/enable-inference-extproc: "true"`, which requires Istio to natively support InferencePool-based ext_proc routing. The Istio/OSSM version on the CI OpenShift cluster doesn't appear to support this feature. What was tried: 1.... #357

Workflow file for this run

.github/workflows/ci-benchmark.yaml at 036dc2a

	name: CI - Benchmark
	# Title shown in the Actions run list.
	# - workflow_dispatch: platform, model and ref.
	# - issue_comment containing a benchmark command: PR number plus the comment.
	# - any other comment (the workflow is started for every new comment):
	#   a short "skipped" title instead of the entire comment body.
	run-name: >-
	  ${{ github.event_name == 'workflow_dispatch'
	  && format('Benchmark: {0} | {1} | {2}',
	  inputs.platform,
	  inputs.model_id || 'unsloth/Meta-Llama-3.1-8B',
	  github.ref_name)
	  || (contains(github.event.comment.body, '/benchmark kind')
	  || contains(github.event.comment.body, '/benchmark openshift'))
	  && format('Benchmark: PR #{0} | {1}',
	  github.event.issue.number,
	  github.event.comment.body)
	  || format('Benchmark: PR #{0} | skipped (no benchmark command)',
	  github.event.issue.number) }}

	# Concurrency group selection:
	# - issue_comment events whose body contains neither benchmark command get a
	#   group that is unique to the run (keyed on run_id), so they never share a
	#   group with a benchmark run.
	# - everything else shares "benchmark-<PR number>" (or "benchmark-<run id>"
	#   when there is no issue number); a newer run in that group cancels the
	#   one in progress.
	concurrency:
	  cancel-in-progress: true
	  group: >-
	    ${{
	    github.event_name == 'issue_comment'
	    && !contains(github.event.comment.body, '/benchmark kind')
	    && !contains(github.event.comment.body, '/benchmark openshift')
	    && format('benchmark-isolated-{0}', github.run_id)
	    || format('benchmark-{0}', github.event.issue.number || github.run_id)
	    }}

	# Triggers: a manual dispatch with platform/model inputs, or any newly
	# created issue/PR comment (the gate job decides whether it is a command).
	on:
	  workflow_dispatch:
	    inputs:
	      platform:
	        type: choice
	        description: 'Platform: kind or openshift'
	        required: true
	        default: 'kind'
	        options:
	          - kind
	          - openshift
	      model_id:
	        type: string
	        description: 'Model to benchmark (HuggingFace ID)'
	        required: false
	        default: 'unsloth/Meta-Llama-3.1-8B'
	  issue_comment:
	    types:
	      - created

	jobs:
	# Job (child of `jobs:`): decides whether a benchmark should run and for
	# which platform. The jobs below read its outputs.
	#   run_benchmark  'true' / 'false'
	#   platform       'kind' or 'openshift'
	#   pr_number, pr_head_sha, pr_head_repo
	#                  set when a PR is known (pr_head_repo only for comments)
	gate:
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	    pull-requests: write
	  outputs:
	    run_benchmark: ${{ steps.check.outputs.run_benchmark }}
	    platform: ${{ steps.check.outputs.platform }}
	    pr_number: ${{ steps.check.outputs.pr_number }}
	    pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
	    pr_head_repo: ${{ steps.check.outputs.pr_head_repo }}
	  steps:
	    - name: Check if benchmark requested
	      id: check
	      uses: actions/github-script@v7
	      with:
	        script: |
	          // True when `username` holds admin, maintain or write permission on
	          // this repository. Any API error is logged and treated as "no access".
	          async function hasWriteAccess(username) {
	            try {
	              const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                username: username
	              });
	              const privilegedRoles = ['admin', 'maintain', 'write'];
	              return privilegedRoles.includes(permission.permission);
	            } catch (e) {
	              console.log(`Could not get permissions for ${username}: ${e.message}`);
	              return false;
	            }
	          }

	          if (context.eventName !== 'issue_comment' && context.eventName !== 'workflow_dispatch') {
	            core.setOutput('run_benchmark', 'false');
	            return;
	          }

	          // --- Manual dispatch: always runs, platform comes from the input ---
	          if (context.eventName === 'workflow_dispatch') {
	            const platform = context.payload.inputs.platform;
	            console.log(`Manual benchmark dispatch for ${platform}`);

	            core.setOutput('run_benchmark', 'true');
	            core.setOutput('platform', platform);

	            // Try to find a PR for the current branch so we can post results
	            const branch = context.ref.replace('refs/heads/', '');
	            const { data: prs } = await github.rest.pulls.list({
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              head: `${context.repo.owner}:${branch}`,
	              state: 'open',
	            });
	            if (prs.length > 0) {
	              core.setOutput('pr_number', prs[0].number.toString());
	              core.setOutput('pr_head_sha', prs[0].head.sha);
	              console.log(`Found open PR #${prs[0].number} for branch ${branch}`);
	            } else {
	              console.log(`No open PR found for branch ${branch}, skipping PR outputs`);
	            }
	            return;
	          }

	          // --- Comment trigger ---
	          const comment = context.payload.comment.body.trim();
	          const issue = context.payload.issue;

	          if (!issue.pull_request) {
	            console.log('Comment is not on a PR, skipping');
	            core.setOutput('run_benchmark', 'false');
	            return;
	          }

	          // The trimmed comment must be exactly one of the commands.
	          const validCommands = ['/benchmark kind', '/benchmark openshift'];
	          if (!validCommands.includes(comment)) {
	            console.log(`Comment "${comment}" is not a valid benchmark command, skipping`);
	            core.setOutput('run_benchmark', 'false');
	            return;
	          }

	          const commenter = context.payload.comment.user.login;
	          const hasAccess = await hasWriteAccess(commenter);
	          if (!hasAccess) {
	            console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
	            core.setOutput('run_benchmark', 'false');
	            return;
	          }

	          const { data: pr } = await github.rest.pulls.get({
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            pull_number: issue.number
	          });

	          // Fall back to this repository when the PR's head repo is not reported.
	          const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
	          const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;

	          // Log the command that was actually issued (kind or openshift).
	          console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
	          console.log(`PR head SHA: ${pr.head.sha}`);

	          // Acknowledge the command on the PR: reaction plus a link to this run.
	          await github.rest.reactions.createForIssueComment({
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            comment_id: context.payload.comment.id,
	            content: 'rocket'
	          });

	          const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
	          const platform = comment.includes('openshift') ? 'OpenShift' : 'Kind';
	          await github.rest.issues.createComment({
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            issue_number: issue.number,
	            body: `🚀 Benchmark (${platform}) triggered by \`${comment}\`\n\n[View the benchmark workflow run](${runUrl})`
	          });

	          core.setOutput('run_benchmark', 'true');
	          core.setOutput('platform', platform.toLowerCase());
	          core.setOutput('pr_number', issue.number.toString());
	          core.setOutput('pr_head_sha', pr.head.sha);
	          core.setOutput('pr_head_repo', headRepo);

	# Job (child of `jobs:`): builds and pushes the image used by the OpenShift
	# benchmark. Output `image_tag` is the tag that was pushed.
	# NOTE(review): this job runs `make` from the checked-out PR head while
	# registry credentials are configured; the gate job's write-access check on
	# the commenter is what guards it.
	build-image:
	  needs: gate
	  if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift')
	  runs-on: ubuntu-latest
	  outputs:
	    image_tag: ${{ steps.build.outputs.image_tag }}
	  steps:
	    - name: Checkout source
	      uses: actions/checkout@v4
	      with:
	        # Same repository/ref selection as the benchmark jobs, so the image is
	        # built from the code that is benchmarked (including fork PRs).
	        repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}
	        ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}

	    - name: Log in to GHCR
	      uses: docker/login-action@v3
	      with:
	        registry: ghcr.io
	        username: ${{ secrets.CR_USER }}
	        password: ${{ secrets.CR_TOKEN }}

	    - name: Build and push image
	      id: build
	      env:
	        REGISTRY: ghcr.io
	        IMAGE_NAME: ${{ github.repository }}
	        # pr_head_sha is empty for a manual dispatch without an open PR; fall
	        # back to the dispatched commit so the tag is never just "bench-".
	        GIT_REF: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
	      run: |
	        # Tag is "bench-" plus the first 8 characters of the commit SHA.
	        IMAGE_TAG="bench-$(printf '%s' "$GIT_REF" | cut -c1-8)"
	        FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
	        echo "Building image: $FULL_IMAGE"

	        make docker-build IMG="$FULL_IMAGE"
	        make docker-push IMG="$FULL_IMAGE"

	        echo "image_tag=${IMAGE_TAG}" >> "$GITHUB_OUTPUT"

	# Job (child of `jobs:`): runs the benchmark on a local Kind cluster, uploads
	# the result files as an artifact and, for comment-triggered runs, posts a
	# summary comment on the PR.
	benchmark-kind:
	  runs-on: ubuntu-latest
	  needs: [gate]
	  if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'kind' || github.event.inputs.platform == 'kind')
	  timeout-minutes: 45
	  permissions:
	    contents: write
	    statuses: write
	    pull-requests: write
	    actions: read
	  steps:
	    # Runs before the status call so an empty SHA produces this explicit
	    # error rather than an API failure.
	    - name: Validate PR head SHA
	      if: github.event_name == 'issue_comment'
	      env:
	        PR_HEAD_SHA: ${{ needs.gate.outputs.pr_head_sha }}
	      run: |
	        if [ -z "$PR_HEAD_SHA" ]; then
	          echo "::error::pr_head_sha is empty — refusing to fall back to main"
	          exit 1
	        fi
	        echo "Checkout will use PR head SHA: $PR_HEAD_SHA"

	    - name: Set pending status on PR head
	      if: github.event_name == 'issue_comment'
	      uses: actions/github-script@v7
	      env:
	        # Passed through the environment instead of being interpolated into
	        # the script source.
	        PR_HEAD_SHA: ${{ needs.gate.outputs.pr_head_sha }}
	      with:
	        script: |
	          await github.rest.repos.createCommitStatus({
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            sha: process.env.PR_HEAD_SHA,
	            state: 'pending',
	            target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
	            description: 'Benchmark running...',
	            context: `${context.workflow} / benchmark-kind`
	          });

	    - name: Checkout source
	      uses: actions/checkout@v4
	      with:
	        repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}
	        ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
	        token: ${{ secrets.GITHUB_TOKEN }}

	    # Exports GO_VERSION from the `go <version>` line of go.mod.
	    - name: Extract Go version from go.mod
	      run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> "$GITHUB_ENV"

	    - name: Set up Go with cache
	      uses: actions/setup-go@v6
	      with:
	        go-version: "${{ env.GO_VERSION }}"
	        cache-dependency-path: ./go.sum

	    - name: Install dependencies
	      run: go mod download

	    - name: Install Kind
	      run: |
	        ARCH=$(uname -m)
	        case "$ARCH" in
	          x86_64) KIND_ARCH="amd64" ;;
	          aarch64) KIND_ARCH="arm64" ;;
	          *) echo "Unsupported architecture: $ARCH"; exit 1 ;;
	        esac
	        # -f makes curl fail on an HTTP error instead of saving the error page
	        # as the binary; the retry flags match the OpenShift job's downloads.
	        curl -fsSL --retry 3 --retry-delay 5 -o ./kind "https://kind.sigs.k8s.io/dl/v0.25.0/kind-linux-${KIND_ARCH}"
	        chmod +x ./kind
	        sudo mv ./kind /usr/local/bin/kind
	        kind version

	    - name: Set up Docker Buildx
	      uses: docker/setup-buildx-action@v3

	    - name: Build WVA image locally
	      id: build-image
	      env:
	        # pr_head_sha is empty for a manual dispatch without an open PR; fall
	        # back to the dispatched commit so the tag is never just "bench-".
	        CHECKOUT_SHA: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
	      run: |
	        IMAGE_NAME="llm-d-workload-variant-autoscaler"
	        IMAGE_TAG="bench-${CHECKOUT_SHA:0:7}"
	        FULL_IMAGE="localhost/${IMAGE_NAME}:${IMAGE_TAG}"
	        echo "Building local image: $FULL_IMAGE"
	        make docker-build IMG="$FULL_IMAGE"
	        echo "image=$FULL_IMAGE" >> "$GITHUB_OUTPUT"

	    - name: Deploy e2e infrastructure
	      env:
	        ENVIRONMENT: kind-emulator
	        USE_SIMULATOR: "true"
	        CREATE_CLUSTER: "true"
	        INSTALL_GATEWAY_CTRLPLANE: "true"
	        E2E_TESTS_ENABLED: "true"
	        IMG: ${{ steps.build-image.outputs.image }}
	        SKIP_BUILD: "true"
	        KV_SPARE_TRIGGER: "0.1"
	        QUEUE_SPARE_TRIGGER: "3"
	        INSTALL_GRAFANA: "true"
	      run: make deploy-e2e-infra

	    - name: Run benchmark
	      env:
	        ENVIRONMENT: kind-emulator
	        USE_SIMULATOR: "true"
	        SCALER_BACKEND: prometheus-adapter
	        BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
	        BENCHMARK_GRAFANA_ENABLED: "true"
	        BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
	        BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
	        BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
	        KV_SPARE_TRIGGER: "0.1"
	        QUEUE_SPARE_TRIGGER: "3"
	      run: make test-benchmark

	    - name: Upload benchmark results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-results
	        path: |
	          /tmp/benchmark-results.json
	          /tmp/prefill-benchmark-results.json
	          /tmp/benchmark-grafana-snapshot.txt
	          /tmp/benchmark-grafana-snapshot.json
	          /tmp/benchmark-panels/
	        if-no-files-found: warn

	    - name: Post benchmark results as PR comment
	      if: always() && github.event_name == 'issue_comment'
	      uses: actions/github-script@v7
	      env:
	        PR_NUMBER: ${{ needs.gate.outputs.pr_number }}
	        PR_HEAD_SHA: ${{ needs.gate.outputs.pr_head_sha }}
	      with:
	        script: |
	          const fs = require('fs');
	          const path = require('path');
	          const prNumber = parseInt(process.env.PR_NUMBER, 10);
	          const sha = process.env.PR_HEAD_SHA || '';
	          const runId = context.runId;
	          const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;

	          // Look up the uploaded artifact to get a direct download link;
	          // fall back to the run page when it cannot be found.
	          let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
	          try {
	            const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              run_id: runId
	            });
	            const benchArtifact = artifacts.find(a => a.name === 'benchmark-results');
	            if (benchArtifact) {
	              artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
	            }
	          } catch (e) {
	            console.log(`Could not look up artifact: ${e.message}`);
	          }

	          let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';

	          try {
	            const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));

	            // A missing or non-numeric field renders as N/A instead of
	            // discarding the whole table. Negative times also mean N/A.
	            const fixed = (v, digits, suffix = '') =>
	              (typeof v === 'number' && Number.isFinite(v)) ? `${v.toFixed(digits)}${suffix}` : 'N/A';
	            const fmtTime = (v) => (typeof v === 'number' && v >= 0) ? `${v.toFixed(1)}s` : 'N/A';

	            // Joined from an array so the Markdown never depends on how the
	            // script is indented inside the YAML block.
	            resultsTable = [
	              '| Metric | Value |',
	              '|--------|-------|',
	              `| Scale-up time | ${fmtTime(data.scaleUpTimeSec)} |`,
	              `| Scale-down time | ${fmtTime(data.scaleDownTimeSec)} |`,
	              `| Max replicas | ${data.maxReplicas ?? 'N/A'} |`,
	              `| Avg KV cache usage | ${fixed(data.avgKVCacheUsage, 3)} |`,
	              `| Avg queue depth | ${fixed(data.avgQueueDepth, 1)} |`,
	              `| Replica oscillation (σ) | ${fixed(data.replicaOscillation, 2)} |`,
	              `| Total duration | ${fixed(data.totalDurationSec, 0, 's')} |`
	            ].join('\n');
	          } catch (e) {
	            console.log(`Could not read results: ${e.message}`);
	          }

	          // Upload panel PNGs as release assets and collect URLs for embedding
	          let panelImages = '';
	          const panelDir = '/tmp/benchmark-panels';
	          const pngs = fs.existsSync(panelDir)
	            ? fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort()
	            : [];
	          const hasPanels = pngs.length > 0;

	          if (hasPanels) {
	            const tag = `benchmark-run-${runId}`;

	            try {
	              // Create a lightweight release to host panel images
	              const release = await github.rest.repos.createRelease({
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                tag_name: tag,
	                name: `Benchmark panels (PR #${prNumber}, ${sha.substring(0, 7)})`,
	                body: `Auto-generated by benchmark CI run #${runId}`,
	                draft: false,
	                prerelease: true
	              });

	              const imageUrls = [];
	              for (const png of pngs) {
	                const filePath = path.join(panelDir, png);
	                const fileData = fs.readFileSync(filePath);
	                const asset = await github.rest.repos.uploadReleaseAsset({
	                  owner: context.repo.owner,
	                  repo: context.repo.repo,
	                  release_id: release.data.id,
	                  name: png,
	                  data: fileData,
	                  headers: { 'content-type': 'image/png' }
	                });
	                // "panel-foo-bar.png" -> "foo bar"
	                const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
	                imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
	                console.log(`Uploaded ${png}: ${asset.data.browser_download_url}`);
	              }

	              if (imageUrls.length > 0) {
	                panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
	              }
	            } catch (e) {
	              console.log(`Could not upload panel images: ${e.message}`);
	            }
	          }

	          // Check for Grafana snapshot
	          const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
	          let artifactsSection = '';
	          if (hasSnapshotJson || hasPanels) {
	            const items = [];
	            if (hasSnapshotJson) {
	              items.push('Grafana snapshot JSON');
	            }
	            artifactsSection = `\n\n📎 [Download artifacts](${artifactUrl})${items.length ? ' — ' + items.join(', ') : ''}`;
	          }

	          const body = [
	            '## Benchmark: scale-up-latency (Kind)',
	            '',
	            `${resultsTable}${panelImages}${artifactsSection}`,
	            '',
	            '<details>',
	            '<summary>Environment</summary>',
	            '',
	            '- Cluster: Kind (emulated GPUs)',
	            '- Model: unsloth/Meta-Llama-3.1-8B (simulator)',
	            `- Commit: ${sha.substring(0, 7)}`,
	            '- Scaler: prometheus-adapter',
	            `- [Workflow run](${repoUrl}/actions/runs/${runId})`,
	            '',
	            '</details>'
	          ].join('\n');

	          await github.rest.issues.createComment({
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            issue_number: prNumber,
	            body: body
	          });

	    - name: Cleanup Kind cluster
	      if: always()
	      run: kind delete cluster --name kind-wva-gpu-cluster || true

	# Job (child of `jobs:`): runs the benchmark on the self-hosted OpenShift
	# runner, using the image tag produced by build-image. Namespaces are keyed
	# on the PR number, or on the run id when no PR number is available.
	benchmark-openshift:
	  needs:
	    - gate
	    - build-image
	  if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift')
	  runs-on:
	    - self-hosted
	    - openshift
	    - vllm-d
	  timeout-minutes: 60
	  permissions:
	    actions: read
	    contents: write
	    pull-requests: write
	    statuses: write
	  env:
	    GOTOOLCHAIN: auto
	    MODEL_ID: ${{ inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }}
	    ACCELERATOR_TYPE: 'H100'
	    LLMD_NAMESPACE: llm-d-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
	    WVA_NAMESPACE: wva-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
	    WVA_RELEASE_NAME: wva-bench-${{ github.run_id }}
	    WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
	  steps:
	# Marks the PR head commit as "pending" with a link back to this run
	# (comment-triggered runs only).
	- name: Set pending status on PR head
	  uses: actions/github-script@v7
	  if: github.event_name == 'issue_comment'
	  with:
	    script: |
	      const { owner, repo } = context.repo;
	      const runUrl = `https://github.com/${owner}/${repo}/actions/runs/${context.runId}`;

	      await github.rest.repos.createCommitStatus({
	        owner,
	        repo,
	        sha: '${{ needs.gate.outputs.pr_head_sha }}',
	        state: 'pending',
	        target_url: runUrl,
	        description: 'Benchmark running on OpenShift...',
	        context: '${{ github.workflow }} / benchmark-openshift'
	      });

	# Checks out the PR head when the gate job reported one, otherwise the
	# commit this workflow run was started for.
	- name: Checkout source
	  uses: actions/checkout@v4
	  with:
	    token: ${{ secrets.GITHUB_TOKEN }}
	    ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
	    repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}

	# Take the Go version from go.mod, as the Kind job does, instead of a
	# hard-coded release line that can drift from the module's requirement.
	- name: Set up Go
	  uses: actions/setup-go@v6
	  with:
	    go-version-file: go.mod
	    cache-dependency-path: ./go.sum

	- name: Install tools (kubectl, oc, helm, make)
	  run: |
	    # One curl invocation shared by every download in this step.
	    fetch() { curl -fsSL --retry 3 --retry-delay 5 "$@"; }

	    KUBECTL_VERSION="v1.31.0"
	    oc_archive="openshift-client-linux.tar.gz"

	    # make
	    sudo apt-get update && sudo apt-get install -y make

	    # kubectl (pinned version)
	    fetch -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
	    chmod +x kubectl
	    sudo mv kubectl /usr/local/bin/

	    # oc (stable channel); afterwards remove the archive and any kubectl /
	    # README.md left in the working directory.
	    fetch -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/${oc_archive}"
	    tar -xzf "${oc_archive}"
	    sudo mv oc /usr/local/bin/
	    rm -f "${oc_archive}" kubectl README.md

	    # helm, via the upstream installer script
	    fetch https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

	# Fails the job early when the runner cannot reach the cluster.
	- name: Verify cluster access
	  run: |
	    kubectl cluster-info
	    kubectl get nodes

	# Reads HF_TOKEN from the llm-d-hf-token secret (namespace default), masks it
	# in the logs and exports it to later steps.
	- name: Get HF token from cluster secret
	  id: hf-token
	  run: |
	    # With no `shell:` set the step runs under `bash -e` without pipefail,
	    # so this pipeline's status is base64's and a failed kubectl lookup
	    # leaves HF_TOKEN empty without failing the step.
	    HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d)
	    if [ -z "$HF_TOKEN" ]; then
	      # Still best-effort (the job continues), but no longer silent.
	      echo "::warning::HF token is empty (secret llm-d-hf-token in namespace default missing or unreadable); continuing without it"
	    else
	      echo "::add-mask::$HF_TOKEN"
	    fi
	    echo "HF_TOKEN=$HF_TOKEN" >> "$GITHUB_ENV"

	# Best-effort removal of whatever an earlier run left in this PR's two
	# namespaces; every deletion tolerates failure.
	- name: Clean up resources for this PR
	  run: |
	    wva_selector="app.kubernetes.io/name=workload-variant-autoscaler"

	    # Empties and deletes one namespace; does nothing when it does not exist.
	    purge_namespace() {
	      local target="$1"
	      local resource rel

	      kubectl get namespace "$target" &>/dev/null || return 0

	      for resource in hpa variantautoscaling; do
	        kubectl delete "$resource" -n "$target" -l "$wva_selector" --ignore-not-found || true
	      done

	      for rel in $(helm list -n "$target" -q 2>/dev/null); do
	        helm uninstall "$rel" -n "$target" --ignore-not-found --wait --timeout 60s || true
	      done

	      kubectl delete namespace "$target" --ignore-not-found --timeout=60s || true
	    }

	    purge_namespace "$LLMD_NAMESPACE"
	    purge_namespace "$WVA_NAMESPACE"

	# Applies the CRDs from the checked-out chart before installing.
	- name: Apply latest CRDs
	  run: kubectl apply -f charts/workload-variant-autoscaler/crds/

	# Runs deploy/install.sh with the settings below passed as environment
	# variables.
	- name: Deploy WVA and llm-d infrastructure
	  env:
	    # Target environment and namespaces
	    ENVIRONMENT: openshift
	    NAMESPACE_SCOPED: "false"
	    LLMD_NS: ${{ env.LLMD_NAMESPACE }}
	    WVA_NS: ${{ env.WVA_NAMESPACE }}
	    CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
	    MONITORING_NAMESPACE: openshift-user-workload-monitoring
	    # Install toggles
	    INSTALL_GATEWAY_CTRLPLANE: "false"
	    INSTALL_GRAFANA: "true"
	    E2E_TESTS_ENABLED: "true"
	    DEPLOY_VA: "false"
	    DEPLOY_HPA: "false"
	    DECODE_REPLICAS: "1"
	    WVA_METRICS_SECURE: "false"
	    # Threshold / trigger values (same values as the "Run benchmark" step)
	    KV_CACHE_THRESHOLD: "0.90"
	    QUEUE_LENGTH_THRESHOLD: "10"
	    KV_SPARE_TRIGGER: "0.05"
	    QUEUE_SPARE_TRIGGER: "2"
	    # VLLM_* settings
	    VLLM_SVC_PORT: "8000"
	    VLLM_MAX_NUM_SEQS: "1024"
	    VLLM_GPU_MEM_UTIL: "0.95"
	    VLLM_MAX_MODEL_LEN: "16000"
	    VLLM_BLOCK_SIZE: "64"
	    VLLM_ENFORCE_EAGER: "true"
	  run: |
	    ./deploy/install.sh \
	      --model "$MODEL_ID" \
	      --accelerator "$ACCELERATOR_TYPE" \
	      --release-name "$WVA_RELEASE_NAME" \
	      --environment openshift

	# Adds the openshift.io/user-monitoring=true label to both benchmark
	# namespaces.
	- name: Label namespaces for OpenShift monitoring
	  run: |
	    for bench_ns in "$LLMD_NAMESPACE" "$WVA_NAMESPACE"; do
	      kubectl label namespace "$bench_ns" openshift.io/user-monitoring=true --overwrite
	    done

	# Waits for the WVA and decode deployments (a timeout does not fail the
	# step), then lists the services in both monitoring namespaces.
	- name: Wait for infrastructure to be ready
	  run: |
	    kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true
	    kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || true

	    for mon_ns in openshift-user-workload-monitoring openshift-monitoring; do
	      echo "--- Services in ${mon_ns} ---"
	      kubectl get svc -n "${mon_ns}"
	    done

	# Runs `make test-benchmark` against the OpenShift deployment while a
	# background loop keeps the external-metrics APIService pointed at
	# prometheus-adapter. The step exits with the benchmark's exit code.
	- name: Run benchmark
	  env:
	    ENVIRONMENT: openshift
	    USE_SIMULATOR: "false"
	    SCALER_BACKEND: prometheus-adapter
	    CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
	    E2E_MONITORING_NAMESPACE: openshift-user-workload-monitoring
	    E2E_EMULATED_LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
	    CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
	    BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
	    BENCHMARK_GRAFANA_ENABLED: "true"
	    BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
	    BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
	    BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
	    KV_CACHE_THRESHOLD: "0.90"
	    QUEUE_LENGTH_THRESHOLD: "10"
	    KV_SPARE_TRIGGER: "0.05"
	    QUEUE_SPARE_TRIGGER: "2"
	  run: |
	    # Get token for Thanos querier (empty when it cannot be created).
	    PROMETHEUS_TOKEN=$(kubectl create token prometheus-k8s -n openshift-monitoring --duration=24h 2>/dev/null || echo "")
	    if [ -n "$PROMETHEUS_TOKEN" ]; then
	      # Keep the bearer token out of the job log.
	      echo "::add-mask::$PROMETHEUS_TOKEN"
	    fi
	    export PROMETHEUS_TOKEN

	    # Start APIService guard: KEDA on this cluster continuously reclaims the
	    # external.metrics.k8s.io APIService. This background loop re-patches it
	    # every 8 seconds so the HPA can read wva_desired_replicas during the benchmark.
	    # Key fix: caBundle must be set to null because KEDA sets it, and Kubernetes
	    # rejects insecureSkipTLSVerify=true when caBundle is present.
	    MONITORING_NS="openshift-user-workload-monitoring"
	    APISERVICE="v1beta1.external.metrics.k8s.io"
	    WANTED_BACKEND="prometheus-adapter/${MONITORING_NS}"
	    GUARD_PATCH=$(printf '{"spec":{"caBundle":null,"insecureSkipTLSVerify":true,"service":{"name":"prometheus-adapter","namespace":"%s"}}}' "$MONITORING_NS")
	    (
	      while true; do
	        sleep 8
	        # `|| true`: the subshell inherits errexit, so without it a single
	        # failed lookup would silently end the guard for the rest of the run.
	        current=$(kubectl get apiservice "$APISERVICE" -o jsonpath='{.spec.service.name}/{.spec.service.namespace}' 2>/dev/null || true)
	        if [ "$current" != "$WANTED_BACKEND" ]; then
	          echo "[apiservice-guard] KEDA reclaimed (now: $current), re-patching..."
	          kubectl patch apiservice "$APISERVICE" --type=merge -p "$GUARD_PATCH" 2>&1 || true
	        fi
	      done
	    ) &
	    GUARD_PID=$!
	    # Stop the guard however this script ends.
	    trap 'kill "$GUARD_PID" 2>/dev/null || true' EXIT
	    echo "APIService guard started (PID=$GUARD_PID)"

	    # Give guard time to do initial patch if needed
	    sleep 12
	    echo "Checking external metrics API..."
	    # Test kubectl itself: piping it into `head` would report head's exit
	    # status and print "OK" even when the API is unavailable.
	    if api_listing=$(kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1"); then
	      head -1 <<<"$api_listing"
	      echo "External metrics API: OK"
	    else
	      echo "WARNING: External metrics API not available"
	    fi

	    TEST_EXIT=0
	    make test-benchmark || TEST_EXIT=$?

	    exit $TEST_EXIT

	- name: Generate benchmark plots
	if: always()
	run: \|
	echo "Installing matplotlib and numpy..."
	if python3 -m venv /tmp/plot-venv 2>&1; then
	/tmp/plot-venv/bin/pip install --quiet matplotlib numpy 2>&1
	PYTHON=/tmp/plot-venv/bin/python3
	else
	echo "venv failed, using PIP_BREAK_SYSTEM_PACKAGES fallback..."
	curl -sSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
	PIP_BREAK_SYSTEM_PACKAGES=1 python3 /tmp/get-pip.py --user 2>&1
	PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip install --user matplotlib numpy 2>&1
	PYTHON=python3
	fi
	$PYTHON - <<'PLOTEOF'
	import json, os, sys
	try:
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import matplotlib.ticker as ticker
	import numpy as np
	except ImportError:
	print("matplotlib not available, skipping plot generation")
	sys.exit(0)

	PANEL_DIR = '/tmp/benchmark-panels'
	PREFILL_FILE = '/tmp/prefill-benchmark-results.json'
	os.makedirs(PANEL_DIR, exist_ok=True)

	if not os.path.exists(PREFILL_FILE):
	print("No prefill results found, skipping plots")
	sys.exit(0)

	with open(PREFILL_FILE) as f:
	results = json.load(f)

	if not isinstance(results, list) or len(results) == 0:
	print("No prefill results found")
	sys.exit(0)

	plt.rcParams.update({
	'figure.facecolor': 'white', 'axes.facecolor': '#f8f9fa',
	'axes.grid': True, 'grid.alpha': 0.3, 'font.size': 12,
	'axes.titlesize': 14, 'axes.labelsize': 12, 'figure.dpi': 150,
	})
	WVA_C = '#2ecc71'
	EMPTY_METRIC = {'mean': 0, 'count': 0, 'percentiles': {k: 0 for k in ['p05','p10','p25','p50','p75','p90','p95','p99']}}

	def m(data, key):
	v = data.get(key)
	if isinstance(v, dict):
	if 'percentiles' not in v:
	v['percentiles'] = EMPTY_METRIC['percentiles']
	return v
	return EMPTY_METRIC

	from matplotlib.backends.backend_pdf import PdfPages

	model_id = os.environ.get('MODEL_ID', 'unknown')
	model_short = model_id.split('/')[-1]

	for data in results:
	atype = data.get('autoscaler_type', 'WVA')
	color = WVA_C

	tp_obj = m(data, 'throughput')
	ttft_obj = m(data, 'ttft')
	itl_obj = m(data, 'itl')
	tp_mean = tp_obj.get('mean', 0)
	ttft_mean = ttft_obj.get('mean', 0)
	ttft_p50 = ttft_obj.get('percentiles', {}).get('p50', 0)
	ttft_p99 = ttft_obj.get('percentiles', {}).get('p99', 0)
	itl_mean = itl_obj.get('mean', 0)
	itl_p50 = itl_obj.get('percentiles', {}).get('p50', 0)
	itl_p99 = itl_obj.get('percentiles', {}).get('p99', 0)
	completed = ttft_obj.get('count', 0)
	error_count = data.get('error_count', 0)
	incomplete_count = data.get('incomplete_count', 0)
	achieved_rps = data.get('achieved_rps', 0)
	error_rps = error_count / max(data.get('duration_sec', 1), 1)

	# --- Generate standalone PNG charts ---
	mt = data.get('metrics_timeline', [])
	tl = data.get('replica_timeline', [])

	if mt and tl:
	times_m = [s['elapsed_sec'] for s in mt]
	kv = [s['kv_cache']*100 for s in mt]
	qd = [s['queue_depth'] for s in mt]
	epp = [s.get('epp_queue_depth', 0) for s in mt]
	times_r = [s['elapsed_sec'] for s in tl]
	ready = [s['ready_replicas'] for s in tl]

	fig, ax = plt.subplots(figsize=(14, 5))
	ax.step(times_r, ready, where='post', color=color, linewidth=2.5, label='Ready Replicas')
	ax.fill_between(times_r, ready, step='post', alpha=0.15, color=color)
	ax.set_title(f'Replica Count Over Time (Unified Prefill+Decode) — {model_short}', fontsize=14, fontweight='bold')
	ax.set_xlabel('Time (seconds)'); ax.set_ylabel('Replicas')
	ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
	ax.legend(fontsize=11)
	fig.tight_layout()
	fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-replica-timeline.png'), bbox_inches='tight', dpi=150)
	plt.close()

	fig, (ax_kv, ax_qd, ax_epp) = plt.subplots(3, 1, figsize=(14, 11), sharex=True)
	fig.suptitle(f'Metrics Over Time (Unified Prefill+Decode) — {model_short}', fontsize=15, fontweight='bold')
	ax_kv.plot(times_m, kv, color=color, linewidth=2); ax_kv.fill_between(times_m, kv, alpha=0.15, color=color)
	ax_kv.set_ylabel('KV Cache (%)'); ax_kv.set_title('KV Cache Usage')
	ax_qd.plot(times_m, qd, color='#e67e22', linewidth=2); ax_qd.fill_between(times_m, qd, alpha=0.15, color='#e67e22')
	ax_qd.set_ylabel('Requests Waiting'); ax_qd.set_title('vLLM Requests Waiting')
	ax_epp.plot(times_m, epp, color='#3498db', linewidth=2); ax_epp.fill_between(times_m, epp, alpha=0.15, color='#3498db')
	ax_epp.set_ylabel('EPP Queue Size'); ax_epp.set_xlabel('Time (seconds)'); ax_epp.set_title('EPP Flow Control Queue')
	fig.tight_layout()
	fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-metrics-timeline.png'), bbox_inches='tight', dpi=150)
	plt.close()

	print(f"Generated PNG charts in {PANEL_DIR}")
	for f in sorted(os.listdir(PANEL_DIR)):
	if f.endswith('.png'):
	print(f" {f}")

	# --- Generate PDF report (colleague format, 3 pages) ---
	pdf_path = os.path.join(PANEL_DIR, f'report_{atype.lower()}_{model_short}.pdf')
	with PdfPages(pdf_path) as pdf:
	# ===== PAGE 1: Configuration & Results Summary =====
	fig, ax = plt.subplots(figsize=(11, 8.5))
	ax.axis('off')
	sep = '='*90
	dash = '-'*90
	lines = []

	va_cfg = data.get('va_config', 'N/A')
	hpa_cfg = data.get('hpa_config', 'N/A')
	data_model = data.get('model_id', model_id)

	lines.append(sep)
	lines.append(f'AUTOSCALER TYPE : Workload Variant Autoscaler (WVA)')
	lines.append(f'MODEL : {data_model}')
	lines.append(sep)

	pods = data.get('pods', [])
	if pods:
	lines.append(f'{"Pod Name":<55} {"Node":<20} {"GPU":<25} {"Startup"}')
	lines.append(sep)
	for p in pods:
	startup = f'{p["startup_sec"]:.0f}s' if p['startup_sec'] > 0 else 'N/A'
	lines.append(f'{p["name"]:<55} {p["node"]:<20} {p["gpu"]:<25} {startup}')
	lines.append(sep)

	lines.append('EPP Configuration (Feature Gates & Scorer Weights)')
	lines.append(dash)
	lines.append(' featureGates: [flowControl]')
	lines.append(' queue-scorer: weight=2')
	lines.append(' kv-cache-utilization-scorer: weight=2')
	lines.append(' prefix-cache-scorer: weight=3')
	lines.append(sep)
	lines.append('Benchmark Load Generator Configuration')
	lines.append(dash)
	lines.append(f' Profile: poisson \| Rate: 20 req/s \| Max seconds: 600')
	lines.append(f' Prompt tokens: 4000 \| Output tokens: 1000 \| Seed: 42')
	lines.append(sep)
	lines.append('WVA Saturation Scaling Configuration')
	lines.append(dash)
	kv_thresh = os.environ.get('KV_CACHE_THRESHOLD', '0.80')
	queue_thresh = os.environ.get('QUEUE_LENGTH_THRESHOLD', '5')
	kv_spare = os.environ.get('KV_SPARE_TRIGGER', '0.1')
	queue_spare = os.environ.get('QUEUE_SPARE_TRIGGER', '3')
	lines.append(f' kvCacheThreshold: {kv_thresh} \| queueLengthThreshold: {queue_thresh}')
	lines.append(f' kvSpareTrigger: {kv_spare} \| queueSpareTrigger: {queue_spare}')
	lines.append(sep)
	lines.append('Autoscaling Configuration (HPA & VA)')
	lines.append(dash)
	lines.append(f' Variant (VA): {va_cfg}')
	lines.append(f' HPA: {hpa_cfg}')
	lines.append(sep)
	lines.append('True Serving Capacity Analysis (GuideLLM)')
	lines.append(dash)
	lines.append(f' Rate: 20.0 RPS \| Achieved: {achieved_rps:.2f} RPS \| Errors: {error_rps:.2f} RPS \| Tokens/s: {tp_mean:.2f}')
	lines.append(sep)

	sla_ttft = 50.0
	sla_itl = 50.0
	cost = 10.0
	ttft_penalty = ttft_p99 / sla_ttft if sla_ttft > 0 else 0
	itl_penalty = itl_p99 / sla_itl if sla_itl > 0 else 0
	avg_rep = data.get('avg_replicas', 0)
	latency_sub = ttft_penalty + itl_penalty
	resource_mult = avg_rep * cost
	score = resource_mult * latency_sub

	lines.append('Autoscaling Run Score (Lower is Better)')
	lines.append(dash)
	lines.append(f' Worst-Case P99 TTFT: {ttft_p99:.2f} ms')
	lines.append(f' Worst-Case P99 ITL : {itl_p99:.2f} ms')
	lines.append(f' Average Replicas : {avg_rep:.2f}')
	lines.append(f' Average EPP Queue : {data.get("avg_epp_queue_depth", 0):.2f}')
	lines.append(f' Target SLAs: TTFT = {sla_ttft:.0f}ms \| ITL = {sla_itl:.0f}ms')
	lines.append(f' Latency Penalty = ({ttft_p99:.2f}/{sla_ttft:.0f}) + ({itl_p99:.2f}/{sla_itl:.0f}) = {latency_sub:.2f}')
	lines.append(f' Resource Mult = {avg_rep:.2f} x {cost:.1f} = {resource_mult:.2f}')
	lines.append(f' => Final Score = {resource_mult:.2f} x {latency_sub:.2f} = {score:.2f}')
	lines.append(sep)

	ax.text(0.02, 0.98, '\n'.join(lines), transform=ax.transAxes, fontsize=7,
	verticalalignment='top', fontfamily='monospace',
	bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8))
	fig.suptitle(f'WVA Benchmark Report (Saturation V1, Unified Prefill+Decode) — {model_short}', fontsize=14, fontweight='bold')
	pdf.savefig(fig, bbox_inches='tight')
	plt.close()

	# ===== PAGE 2: Time-series charts =====
	if mt and tl:
	fig, axes = plt.subplots(4, 1, figsize=(11, 14), sharex=True)
	fig.suptitle(f'Saturation V1 — Metrics Over Time (Unified Prefill+Decode, {model_short})', fontsize=14, fontweight='bold')

	axes[0].plot(times_m, kv, color=color, linewidth=2)
	axes[0].fill_between(times_m, kv, alpha=0.15, color=color)
	axes[0].set_ylabel('KV Cache Usage (%)')
	axes[0].set_title('Inference Pool Average KV Cache Usage Over Time')

	axes[1].plot(times_m, qd, color='#e67e22', linewidth=2)
	axes[1].fill_between(times_m, qd, alpha=0.15, color='#e67e22')
	axes[1].set_ylabel('Requests Waiting')
	axes[1].set_title('Number of Requests Waiting Over Time')

	axes[2].step(times_r, ready, where='post', color=color, linewidth=2.5, label='Actual Replicas')
	axes[2].fill_between(times_r, ready, step='post', alpha=0.1, color=color)
	ax2b = axes[2].twinx()
	ax2b.plot(times_m, epp, color='#3498db', linewidth=1.5, alpha=0.7, label='EPP Queue')
	ax2b.set_ylabel('EPP Queue Size', color='#3498db')
	axes[2].set_ylabel('Replica Count')
	axes[2].set_title('Decode Replica Count & EPP Queue Over Time')
	axes[2].yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
	axes[2].legend(loc='upper left', fontsize=9)
	ax2b.legend(loc='upper right', fontsize=9)

	total_rps = achieved_rps + error_rps
	incomplete_rps = incomplete_count / max(data.get('duration_sec', 1), 1)
	axes[3].axhline(y=20, color='gray', linestyle='--', linewidth=1, label='Target 20.0 RPS')
	axes[3].bar(['Successful', 'Failed', 'Incomplete'],
	[achieved_rps, error_rps, incomplete_rps],
	color=[color, '#e74c3c', '#f39c12'], alpha=0.85, width=0.5)
	axes[3].set_ylabel('Requests/Second (RPS)')
	axes[3].set_title(f'GuideLLM Requests (Succeeded: {completed}, Failed: {error_count}, Incomplete: {incomplete_count})')
	axes[3].legend(fontsize=9)

	fig.tight_layout()
	pdf.savefig(fig, bbox_inches='tight')
	plt.close()

	# ===== PAGE 3: Latency & Throughput charts =====
	has_pcts = isinstance(ttft_obj, dict) and 'percentiles' in ttft_obj
	if has_pcts:
	fig = plt.figure(figsize=(11, 14))
	fig.suptitle(f'Saturation V1 — GuideLLM Latency & Throughput (Unified Prefill+Decode, {model_short})', fontsize=14, fontweight='bold')
	gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.3)

	ax_ttft = fig.add_subplot(gs[0, :])
	ax_ttft.set_yscale('log')
	ttft_vals_mean = [ttft_mean]
	ttft_vals_p99 = [ttft_p99]
	x_t = np.arange(1)
	w = 0.3
	ax_ttft.bar(x_t - w/2, ttft_vals_mean, w, label='Mean TTFT', color=color, alpha=0.85)
	ax_ttft.bar(x_t + w/2, ttft_vals_p99, w, label='P99 TTFT', color='#e74c3c', alpha=0.85)
	ax_ttft.set_xticks(x_t); ax_ttft.set_xticklabels([f'{20.0} RPS'])
	ax_ttft.set_title('Time To First Token (TTFT) per Run', fontweight='bold')
	ax_ttft.set_ylabel('TTFT (ms, log scale)')
	ax_ttft.legend(fontsize=9)

	ax_itl = fig.add_subplot(gs[1, 0])
	ax_itl.set_yscale('log')
	itl_vals_mean = [itl_mean]
	itl_vals_p99 = [itl_p99]
	ax_itl.bar(x_t - w/2, itl_vals_mean, w, label='Mean ITL', color=color, alpha=0.85)
	ax_itl.bar(x_t + w/2, itl_vals_p99, w, label='P99 ITL', color='#e74c3c', alpha=0.85)
	ax_itl.set_xticks(x_t); ax_itl.set_xticklabels([f'{20.0} RPS'])
	ax_itl.set_title('Inter-Token Latency (ITL) per Run', fontweight='bold')
	ax_itl.set_ylabel('ITL (ms, log scale)')
	ax_itl.legend(fontsize=9)

	ax_tp = fig.add_subplot(gs[1, 1])
	ax_tp.bar([f'{20.0} RPS'], [tp_mean], color=color, alpha=0.85, width=0.4)
	ax_tp.set_title('Overall Token Throughput per Run', fontweight='bold')
	ax_tp.set_ylabel('Tokens / Second')
	for i, v in enumerate([tp_mean]):
	ax_tp.text(i, v + tp_mean*0.02, f'{v:.0f}', ha='center', fontweight='bold')

	ax_conc = fig.add_subplot(gs[2, 0])
	if mt:
	conc_epp = [s.get('epp_queue_depth', 0) for s in mt]
	ax_conc.plot(times_m, conc_epp, color='#3498db', linewidth=2)
	ax_conc.fill_between(times_m, conc_epp, alpha=0.15, color='#3498db')
	ax_conc.set_title('Request Concurrency (EPP Queue)', fontweight='bold')
	ax_conc.set_ylabel('EPP Flow Control Queue Size')
	ax_conc.set_xlabel('Time (seconds)')

	ax_sum = fig.add_subplot(gs[2, 1])
	ax_sum.axis('off')
	summary_lines = [
	f'Completed : {completed}',
	f'Failed : {error_count}',
	f'Incomplete: {incomplete_count}',
	f'RPS : {achieved_rps:.2f}',
	f'',
	f'Throughput: {tp_mean:.0f} tok/s',
	f'TTFT mean : {ttft_mean/1000:.2f}s p99: {ttft_p99/1000:.2f}s',
	f'ITL mean : {itl_mean:.2f}ms p99: {itl_p99:.2f}ms',
	f'',
	f'Avg Replicas: {avg_rep:.2f}',
	f'Max Replicas: {data["max_replicas"]}',
	f'Avg KV Cache: {data["avg_kv_cache"]*100:.2f}%',
	f'Avg EPP Queue: {data.get("avg_epp_queue_depth", 0):.1f}',
	f'',
	f'Score: {score:.2f}',
	]
	ax_sum.text(0.1, 0.9, '\n'.join(summary_lines), transform=ax_sum.transAxes,
	fontsize=11, verticalalignment='top', fontfamily='monospace',
	bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8))
	ax_sum.set_title('Summary', fontweight='bold')

	pdf.savefig(fig, bbox_inches='tight')
	plt.close()

	print(f" Generated PDF report: {pdf_path}")

	print(f"Generated all artifacts in {PANEL_DIR}")
	for f in sorted(os.listdir(PANEL_DIR)):
	if f.endswith('.png') or f.endswith('.pdf'):
	print(f" {f}")
	PLOTEOF

	# Publish the raw benchmark result files and the rendered panel directory as one workflow artifact.
	- name: Upload benchmark results
	# always(): still upload whatever was produced when an earlier step of the job failed.
	if: always()
	uses: actions/upload-artifact@v4
	with:
	# The PR-comment step looks this artifact up by exactly this name to build its download link.
	name: benchmark-results-openshift
	path: \|
	/tmp/benchmark-results.json
	/tmp/prefill-benchmark-results.json
	/tmp/benchmark-grafana-snapshot.txt
	/tmp/benchmark-grafana-snapshot.json
	/tmp/benchmark-panels/
	if-no-files-found: warn  # missing files produce a warning instead of failing the step

	# Build a Markdown report from the result files under /tmp and post it as a comment on the PR.
	- name: Post benchmark results as PR comment
	# Runs even after failures, for comment-triggered runs or whenever the gate job produced a PR number.
	if: always() && (github.event_name == 'issue_comment' \|\| needs.gate.outputs.pr_number != '')
	uses: actions/github-script@v7
	with:
	script: \|
	// Each report section below is optional: a missing or unreadable input leaves that section empty.
	const fs = require('fs');
	const path = require('path');
	const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}');
	const sha = '${{ needs.gate.outputs.pr_head_sha }}';
	const runId = context.runId;
	const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;

	// Link to the run page by default; switch to the direct artifact URL when the lookup succeeds.
	let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
	try {
	const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
	owner: context.repo.owner,
	repo: context.repo.repo,
	run_id: runId
	});
	const benchArtifact = artifacts.find(a => a.name === 'benchmark-results-openshift');
	if (benchArtifact) {
	artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
	}
	} catch (e) {
	console.log(`Could not look up artifact: ${e.message}`);
	}

	// Scale-up latency table. Any error while reading or formatting the file leaves it empty and is only logged.
	let resultsTable = '';
	try {
	const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));
	// Negative durations are rendered as 'N/A'.
	const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`;
	resultsTable = `\n### Scale-Up Latency\n\n\| Metric \| Value \|
	\|--------\|-------\|
	\| Scale-up time \| ${fmtTime(data.scaleUpTimeSec)} \|
	\| Scale-down time \| ${fmtTime(data.scaleDownTimeSec)} \|
	\| Max replicas \| ${data.maxReplicas} \|
	\| Avg KV cache usage \| ${data.avgKVCacheUsage.toFixed(3)} \|
	\| Avg queue depth \| ${data.avgQueueDepth.toFixed(1)} \|
	\| Replica oscillation (σ) \| ${data.replicaOscillation.toFixed(2)} \|
	\| Total duration \| ${data.totalDurationSec.toFixed(0)}s \|`;
	} catch (e) {
	console.log(`Scale-up latency results not found (skipped or not run): ${e.message}`);
	}

	// Prefill benchmark: one report section per entry of the results array.
	let prefillSection = '';
	try {
	const prefillData = JSON.parse(fs.readFileSync('/tmp/prefill-benchmark-results.json', 'utf8'));
	if (Array.isArray(prefillData) && prefillData.length > 0) {
	// fmtP formats obj.percentiles[key] / div with one decimal; fmtM formats obj.mean / div with prec decimals.
	// Both return 'N/A' when the metric object (or, for fmtP, its percentiles) is missing.
	const fmtP = (obj, key, div=1) => obj && obj.percentiles ? (obj.percentiles[key]/div).toFixed(1) : 'N/A';
	const fmtM = (obj, div=1, prec=1) => obj ? (obj.mean/div).toFixed(prec) : 'N/A';

	for (const r of prefillData) {
	const atype = r.autoscaler_type \|\| 'WVA';
	const modelId = r.model_id \|\| process.env.MODEL_ID \|\| 'unknown';

	let table = `\| Metric \| Value \|
	\|--------\|-------\|
	\| Model \| ${modelId} \|
	\| Duration \| ${r.duration_sec.toFixed(0)}s \|
	\| Max Replicas \| ${r.max_replicas} \|
	\| Avg Replicas \| ${r.avg_replicas.toFixed(2)} \|
	\| Avg vLLM Queue Depth \| ${r.avg_queue_depth.toFixed(1)} \|
	\| Avg EPP Queue Depth \| ${(r.avg_epp_queue_depth\|\|0).toFixed(1)} \|
	\| Avg KV Cache \| ${(r.avg_kv_cache*100).toFixed(2)}% \|
	\| TTFT mean \| ${fmtM(r.ttft, 1000)}s \|
	\| TTFT p50 \| ${fmtP(r.ttft, 'p50', 1000)}s \|
	\| TTFT p99 \| ${fmtP(r.ttft, 'p99', 1000)}s \|
	\| ITL mean \| ${fmtM(r.itl, 1, 2)}ms \|
	\| ITL p99 \| ${fmtP(r.itl, 'p99')}ms \|
	\| Throughput mean \| ${fmtM(r.throughput)} tok/s \|
	\| Completed Requests \| ${r.ttft ? r.ttft.count : 'N/A'} \|
	\| Failed Requests \| ${r.error_count \|\| 0} \|
	\| Incomplete Requests \| ${r.incomplete_count \|\| 0} \|
	\| Achieved RPS \| ${(r.achieved_rps \|\| 0).toFixed(2)} \|`;

	// Optional collapsible pod table; a startup_sec that is not greater than zero is shown as 'N/A'.
	let podTable = '';
	if (r.pods && r.pods.length > 0) {
	podTable = `\n\n<details>\n<summary>Pod Placement (${r.pods.length} pods)</summary>\n\n\| Pod \| Node \| GPU \| Startup \|\n\|-----\|------\|-----\|---------\|\n`;
	for (const p of r.pods) {
	const startup = p.startup_sec > 0 ? `${p.startup_sec.toFixed(0)}s` : 'N/A';
	podTable += `\| ${p.name} \| ${p.node} \| ${p.gpu} \| ${startup} \|\n`;
	}
	podTable += `\n</details>`;
	}

	// Optional collapsible table with one row per replica snapshot.
	let timeline = '';
	if (r.replica_timeline && r.replica_timeline.length > 0) {
	timeline = `\n\n<details>\n<summary>Replica Timeline (${r.replica_timeline.length} snapshots)</summary>\n\n\| Time (s) \| Spec \| Ready \|\n\|----------\|------\|-------\|\n`;
	for (const s of r.replica_timeline) {
	timeline += `\| ${s.elapsed_sec.toFixed(0)} \| ${s.spec_replicas} \| ${s.ready_replicas} \|\n`;
	}
	timeline += `\n</details>`;
	}

	// Saturation thresholds come from environment variables, with the fallback values shown here.
	// NOTE(review): the EPP and load-generator lines are fixed text, not read from the results — confirm they match the benchmark run.
	let configSection = `\n\n<details>\n<summary>Configuration</summary>\n\n`;
	configSection += `Scaling Engine: Saturation V1 (unified prefill+decode pods)\n\n`;
	configSection += `WVA Saturation Scaling Config:\n`;
	configSection += `\| Parameter \| Value \|\n\|-----------\|-------\|\n`;
	configSection += `\| kvCacheThreshold \| ${process.env.KV_CACHE_THRESHOLD \|\| '0.80'} \|\n`;
	configSection += `\| queueLengthThreshold \| ${process.env.QUEUE_LENGTH_THRESHOLD \|\| '5'} \|\n`;
	configSection += `\| kvSpareTrigger \| ${process.env.KV_SPARE_TRIGGER \|\| '0.1'} \|\n`;
	configSection += `\| queueSpareTrigger \| ${process.env.QUEUE_SPARE_TRIGGER \|\| '3'} \|\n\n`;
	configSection += `Autoscaling:\n`;
	configSection += `- VA: ${r.va_config \|\| 'N/A'}\n`;
	configSection += `- HPA: ${r.hpa_config \|\| 'N/A'}\n\n`;
	configSection += `EPP Configuration:\n`;
	configSection += `- Feature Gates: flowControl\n`;
	configSection += `- Scorers: queue-scorer (weight=2), kv-cache-utilization-scorer (weight=2), prefix-cache-scorer (weight=3)\n\n`;
	configSection += `Load Generator (GuideLLM):\n`;
	configSection += `- Profile: poisson @ 20 req/s \| Duration: 600s\n`;
	configSection += `- Prompt tokens: 4000 \| Output tokens: 1000 \| Seed: 42\n`;
	configSection += `\n</details>`;

	prefillSection += `\n\n---\n\n## WVA Benchmark: Prefill-Heavy Workload — Unified Prefill+Decode (${atype}, Saturation V1)\n\n${table}${podTable}${configSection}${timeline}`;
	}
	}
	} catch (e) {
	console.log(`Could not read prefill results: ${e.message}`);
	}

	// Embed PNG panels by uploading them as assets of a per-run prerelease and linking the asset URLs.
	let panelImages = '';
	const panelDir = '/tmp/benchmark-panels';
	const hasPanels = fs.existsSync(panelDir) && fs.readdirSync(panelDir).some(f => f.endsWith('.png'));

	if (hasPanels) {
	const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort();
	const tag = `benchmark-run-os-${runId}`;
	try {
	// NOTE(review): creating a release presumably needs a token with contents: write — confirm this job's permissions.
	const release = await github.rest.repos.createRelease({
	owner: context.repo.owner,
	repo: context.repo.repo,
	tag_name: tag,
	name: `Benchmark panels OpenShift (PR #${prNumber}, ${sha.substring(0, 7)})`,
	body: `Auto-generated by benchmark CI run #${runId}`,
	draft: false,
	prerelease: true
	});

	const imageUrls = [];
	for (const png of pngs) {
	const filePath = path.join(panelDir, png);
	const fileData = fs.readFileSync(filePath);
	const asset = await github.rest.repos.uploadReleaseAsset({
	owner: context.repo.owner,
	repo: context.repo.repo,
	release_id: release.data.id,
	name: png,
	data: fileData,
	headers: { 'content-type': 'image/png' }
	});
	// Heading text is derived from the file name: drop 'panel-' and '.png', turn dashes into spaces.
	const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
	imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
	}

	if (imageUrls.length > 0) {
	panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
	}
	} catch (e) {
	console.log(`Could not upload panel images: ${e.message}`);
	}
	}

	// Add a download link when a Grafana snapshot JSON or any panel image exists.
	const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
	let artifactsSection = '';
	if (hasSnapshotJson \|\| hasPanels) {
	const items = [];
	if (hasSnapshotJson) items.push('Grafana snapshot JSON');
	artifactsSection = `\n\n📎 [Download artifacts](${artifactUrl})${items.length ? ' — ' + items.join(', ') : ''}`;
	}

	// Final comment body; sections that were not produced above are empty strings.
	const body = `## WVA Benchmark Results (OpenShift)

	${resultsTable}${prefillSection}${panelImages}${artifactsSection}

	<details>
	<summary>Environment</summary>

	- Cluster: OpenShift (Real GPUs)
	- Model: ${process.env.MODEL_ID \|\| 'unsloth/Meta-Llama-3.1-8B'}
	- Accelerator: H100
	- Commit: ${sha.substring(0, 7)}
	- Scaler: prometheus-adapter
	- [Workflow run](${repoUrl}/actions/runs/${runId})

	</details>`;

	// Post the assembled report on the PR.
	await github.rest.issues.createComment({
	owner: context.repo.owner,
	repo: context.repo.repo,
	issue_number: prNumber,
	body: body
	});

	# Best-effort teardown of the benchmark deployment: every command tolerates
	# failure, so cleanup itself never fails the job.
	- name: Cleanup infrastructure
	  if: always()
	  run: |
	    # Uninstall Helm release $1 from namespace $2, ignoring any error.
	    uninstall_release() {
	      helm uninstall "$1" -n "$2" --ignore-not-found --wait --timeout 60s || true
	    }

	    # Delete namespace $1, ignoring any error.
	    drop_namespace() {
	      kubectl delete namespace "$1" --ignore-not-found --timeout=120s || true
	    }

	    # The WVA release goes first, then every release left in the llm-d namespace.
	    uninstall_release "$WVA_RELEASE_NAME" "$WVA_NAMESPACE"
	    for rel in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
	      uninstall_release "$rel" "$LLMD_NAMESPACE"
	    done

	    drop_namespace "$LLMD_NAMESPACE"
	    drop_namespace "$WVA_NAMESPACE"

	# Publishes the outcome of the benchmark job as a commit status on the PR
	# head commit, so the result is visible on the pull request.
	report-status:
	  runs-on: ubuntu-latest
	  needs: [gate, benchmark-kind, benchmark-openshift]
	  # always(): report even when the benchmark job failed, was cancelled or was
	  # skipped -- but only if the gate job actually requested a benchmark.
	  if: always() && needs.gate.outputs.run_benchmark == 'true'
	  permissions:
	    statuses: write
	  steps:
	    - name: Report status to PR
	      uses: actions/github-script@v7
	      with:
	        script: |
	          const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
	          const platform = '${{ needs.gate.outputs.platform }}';

	          // Any platform value other than 'openshift' is treated as the kind benchmark.
	          const target = platform === 'openshift' ? 'openshift' : 'kind';
	          const benchResult = target === 'openshift'
	            ? '${{ needs.benchmark-openshift.result }}'
	            : '${{ needs.benchmark-kind.result }}';

	          if (!prHeadSha) {
	            console.log('No PR head SHA available, skipping status report');
	            return;
	          }

	          // Only 'success' yields a green status; every other job result is a failure
	          // and differs only in the description shown next to it.
	          let state, description;
	          if (benchResult === 'success') {
	            state = 'success';
	            description = 'Benchmark completed successfully';
	          } else if (benchResult === 'skipped') {
	            state = 'failure';
	            description = 'Benchmark did not run (prerequisite failed or skipped)';
	          } else if (benchResult === 'cancelled') {
	            state = 'failure';
	            description = 'Benchmark cancelled';
	          } else {
	            state = 'failure';
	            description = 'Benchmark failed';
	          }

	          // The status context names the platform that actually ran. It used to be
	          // hard-coded to 'benchmark-kind', so an OpenShift result was published under
	          // (and could overwrite) the kind status on the same commit. The context for
	          // kind runs is unchanged.
	          const statusContext = '${{ github.workflow }} / benchmark-' + target;

	          console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description} (${statusContext})`);

	          await github.rest.repos.createCommitStatus({
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            sha: prHeadSha,
	            state: state,
	            target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
	            description: description,
	            context: statusContext
	          });

	          console.log('Status reported successfully');

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Workflow file

Uh oh!

Workflow file for this run