Add optimization loop performance metrics #2954
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: CI - OpenShift E2E Tests | |
| # Permissions needed for various jobs | |
| permissions: | |
| contents: read | |
| packages: write | |
| pull-requests: write # For posting comments on PRs | |
| statuses: write # For reporting status on fork PR commits | |
| # Cancel previous runs on the same PR to avoid resource conflicts | |
| # Only group by PR number for legitimate triggers (pull_request, workflow_dispatch, /ok-to-test, or /retest comments) | |
| # Regular comments get a unique group (run_id) so they don't cancel in-progress test runs | |
| # | |
| # Logic: | |
| # - Regular comments (not /ok-to-test or /retest): unique group prevents cancellation of real tests | |
| # - Valid triggers: group 'e2e-openshift-{pr_number}' (can cancel previous runs for same PR) | |
| # - Fallback chain for ID: pull_request.number -> issue.number -> run_id | |
| # | |
| # NOTE: Valid command list (/ok-to-test, /retest) must stay in sync with gate job validation (line ~125) | |
| # NOTE(review): contains() below is a substring match, while the gate job requires an exact-match comment. | |
| # A comment that merely mentions "/retest" therefore joins the PR group (cancelling an in-progress run) | |
| # and is then skipped by the gate — confirm this trade-off is intended. | |
| concurrency: | |
| group: >- | |
| ${{ | |
| github.event_name == 'issue_comment' && | |
| !contains(github.event.comment.body, '/ok-to-test') && | |
| !contains(github.event.comment.body, '/retest') | |
| && format('comment-isolated-{0}', github.run_id) | |
| || format('e2e-openshift-{0}', | |
| github.event.pull_request.number | |
| || github.event.issue.number | |
| || github.run_id) | |
| }} | |
| cancel-in-progress: true | |
| on: | |
| pull_request: | |
| branches: | |
| - main | |
| - dev | |
| # Allow maintainers to trigger tests on fork PRs via /ok-to-test comment | |
| issue_comment: | |
| types: [created] | |
| workflow_dispatch: | |
| inputs: | |
| model_id: | |
| description: 'Model ID' | |
| required: false | |
| default: 'unsloth/Meta-Llama-3.1-8B' | |
| accelerator_type: | |
| description: 'Accelerator type (H100, A100, L40S)' | |
| required: false | |
| default: 'H100' | |
| # NOTE(review): the e2e job's env falls back to 'A100' when this input is absent (non-dispatch events), which differs from this dispatch default — confirm intended. | |
| request_rate: | |
| description: 'Request rate (req/s)' | |
| required: false | |
| default: '20' | |
| num_prompts: | |
| description: 'Number of prompts' | |
| required: false | |
| default: '3000' | |
| skip_cleanup: | |
| description: 'Skip cleanup after tests' | |
| required: false | |
| default: 'false' | |
| max_num_seqs: | |
| description: 'vLLM max batch size (lower = easier to saturate)' | |
| required: false | |
| default: '1' | |
| # NOTE(review): the e2e job's env falls back to '5' when this input is absent (non-dispatch events), which differs from this dispatch default — confirm intended. | |
| hpa_stabilization_seconds: | |
| description: 'HPA stabilization window in seconds' | |
| required: false | |
| default: '240' | |
| jobs: | |
| # Check if PR contains code changes (not just docs/metadata) | |
| check-code-changes: | |
| # Emits has_code_changes='true' for any non-PR trigger; for PRs, 'true' only when files outside docs/metadata changed. | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| pull-requests: read | |
| outputs: | |
| has_code_changes: ${{ steps.set-output.outputs.has_code_changes }} | |
| steps: | |
| - name: Checkout source | |
| if: github.event_name == 'pull_request' | |
| uses: actions/checkout@v4 | |
| - name: Check for code changes | |
| if: github.event_name == 'pull_request' | |
| uses: dorny/paths-filter@v3 | |
| id: filter | |
| with: | |
| # Negation-only filter: any changed file NOT matching one of these paths counts as a code change. | |
| filters: | | |
| code: | |
| - '!docs/**' | |
| - '!README.md' | |
| - '!CONTRIBUTING.md' | |
| - '!LICENSE' | |
| - '!OWNERS' | |
| - '!PROJECT' | |
| - name: Set output | |
| id: set-output | |
| run: | | |
| if [ "${{ github.event_name }}" != "pull_request" ]; then | |
| # Always run for issue_comment (/ok-to-test, /retest) and workflow_dispatch | |
| echo "has_code_changes=true" >> $GITHUB_OUTPUT | |
| elif [ -n "${{ steps.filter.outputs.code }}" ]; then | |
| echo "has_code_changes=${{ steps.filter.outputs.code }}" >> $GITHUB_OUTPUT | |
| else | |
| # Fail open: if the filter step produced no output, assume code changed. | |
| echo "has_code_changes=true" >> $GITHUB_OUTPUT | |
| fi | |
| # Gate: Check permissions and handle /ok-to-test for fork PRs. | |
| # - Maintainers (write access): Tests run automatically on pull_request. | |
| # - Fork PRs: Gate succeeds (no failure) so the PR does not show a false red check; E2E runs | |
| # only after a maintainer comments /ok-to-test. Branch protection should require the | |
| # "e2e-openshift" job so merge stays blocked until that run passes. | |
| # Outputs: should_run ('true'/'false'), pr_number, pr_head_sha, is_fork_pr ('true'/'false'). | |
| gate: | |
| needs: check-code-changes | |
| if: needs.check-code-changes.outputs.has_code_changes == 'true' | |
| runs-on: ubuntu-latest | |
| outputs: | |
| should_run: ${{ steps.check.outputs.should_run }} | |
| pr_number: ${{ steps.check.outputs.pr_number }} | |
| pr_head_sha: ${{ steps.check.outputs.pr_head_sha }} | |
| is_fork_pr: ${{ steps.check.outputs.is_fork_pr }} | |
| steps: | |
| - name: Check permissions and OpenShift E2E triggers (/ok-to-test, /retest) | |
| id: check | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| // True when `username` holds admin/maintain/write on this repo; false if the lookup fails. | |
| async function hasWriteAccess(username) { | |
| try { | |
| const response = await github.rest.repos.getCollaboratorPermissionLevel({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| username, | |
| }); | |
| // Treat any of these collaborator roles as privileged. | |
| return ['admin', 'maintain', 'write'].includes(response.data.permission); | |
| } catch (e) { | |
| console.log(`Could not get permissions for ${username}: ${e.message}`); | |
| return false; | |
| } | |
| } | |
| // Always run for workflow_dispatch | |
| if (context.eventName === 'workflow_dispatch') { | |
| core.setOutput('should_run', 'true'); | |
| core.setOutput('pr_number', ''); | |
| core.setOutput('pr_head_sha', context.sha); | |
| core.setOutput('is_fork_pr', 'false'); | |
| return; | |
| } | |
| // Handle issue_comment event (/ok-to-test or /retest) | |
| if (context.eventName === 'issue_comment') { | |
| const comment = context.payload.comment.body.trim(); | |
| const issue = context.payload.issue; | |
| // Only process /ok-to-test or /retest comments on PRs | |
| if (!issue.pull_request) { | |
| console.log('Comment is not on a PR, skipping'); | |
| core.setOutput('should_run', 'false'); | |
| return; | |
| } | |
| // NOTE: This list must stay in sync with the concurrency group logic at the top of the workflow | |
| const validCommands = ['/ok-to-test', '/retest']; | |
| // Exact match on the trimmed body — comments that merely contain a command are ignored here. | |
| if (!validCommands.includes(comment)) { | |
| console.log(`Comment "${comment}" is not a valid trigger command, skipping`); | |
| core.setOutput('should_run', 'false'); | |
| return; | |
| } | |
| // Check if commenter has write access | |
| const commenter = context.payload.comment.user.login; | |
| const hasAccess = await hasWriteAccess(commenter); | |
| if (!hasAccess) { | |
| console.log(`User ${commenter} does not have write access, ignoring ${comment}`); | |
| core.setOutput('should_run', 'false'); | |
| return; | |
| } | |
| // Get PR details to get head SHA | |
| const { data: pr } = await github.rest.pulls.get({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| pull_number: issue.number | |
| }); | |
| // Check if PR is from a fork | |
| const baseRepo = `${context.repo.owner}/${context.repo.repo}`; | |
| const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo; | |
| const isFork = headRepo !== baseRepo; | |
| console.log(`${comment} approved by ${commenter} for PR #${issue.number}`); | |
| console.log(`PR head SHA: ${pr.head.sha}`); | |
| console.log(`Is fork PR: ${isFork} (head: ${headRepo}, base: ${baseRepo})`); | |
| core.setOutput('should_run', 'true'); | |
| core.setOutput('pr_number', issue.number.toString()); | |
| core.setOutput('pr_head_sha', pr.head.sha); | |
| core.setOutput('is_fork_pr', isFork ? 'true' : 'false'); | |
| // Add reaction to acknowledge | |
| // NOTE(review): the reaction/comment calls below are not wrapped in try/catch — an API failure here | |
| // fails the gate step even though should_run was already set; confirm that is acceptable. | |
| await github.rest.reactions.createForIssueComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| comment_id: context.payload.comment.id, | |
| content: 'rocket' | |
| }); | |
| // Post comment with link to the e2e workflow run | |
| const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; | |
| const cmdDesc = comment === '/ok-to-test' ? 'approve and run' : 're-run'; | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: issue.number, | |
| body: `🚀 **OpenShift E2E** — ${cmdDesc} (\`${comment}\`)\n\n[View the OpenShift E2E workflow run](${runUrl})` | |
| }); | |
| return; | |
| } | |
| // Handle pull_request event | |
| const pr = context.payload.pull_request; | |
| const prAuthor = pr.user.login; | |
| const prNumber = pr.number; | |
| const prHeadSha = pr.head.sha; | |
| // Check if PR is from a fork | |
| const baseRepo = `${context.repo.owner}/${context.repo.repo}`; | |
| const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo; | |
| const isFork = headRepo !== baseRepo; | |
| console.log(`PR #${prNumber} is from fork: ${isFork} (head: ${headRepo}, base: ${baseRepo})`); | |
| core.setOutput('pr_number', prNumber.toString()); | |
| core.setOutput('pr_head_sha', prHeadSha); | |
| core.setOutput('is_fork_pr', isFork ? 'true' : 'false'); | |
| // Check if PR author has write access | |
| const isPrivileged = await hasWriteAccess(prAuthor); | |
| console.log(`PR #${prNumber} author ${prAuthor}: privileged=${isPrivileged}`); | |
| // Check if we already posted a bot comment | |
| // NOTE(review): listComments returns only the first page (30 by default); the dedupe check may | |
| // miss the marker comment on long threads — consider github.paginate. | |
| const comments = await github.rest.issues.listComments({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber | |
| }); | |
| const botComment = comments.data.find(c => | |
| c.user.type === 'Bot' && | |
| c.body.includes('ok-to-test') | |
| ); | |
| // Helper to safely post a comment (may fail on fork PRs due to permissions) | |
| async function tryPostComment(body) { | |
| try { | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber, | |
| body: body | |
| }); | |
| return true; | |
| } catch (e) { | |
| // Fork PRs can't post comments on pull_request event (GitHub security restriction) | |
| console.log(`Could not post comment (expected for fork PRs): ${e.message}`); | |
| return false; | |
| } | |
| } | |
| if (isPrivileged) { | |
| // For maintainer/admin fork PRs, we need to trigger via /ok-to-test | |
| // because fork PRs don't have access to secrets on pull_request event | |
| if (isFork) { | |
| console.log(`Maintainer fork PR detected - auto-triggering /ok-to-test for ${prAuthor}`); | |
| core.setOutput('should_run', 'false'); // Don't run on pull_request event | |
| // Auto-post /ok-to-test to trigger issue_comment workflow | |
| if (!botComment) { | |
| const posted = await tryPostComment(`/ok-to-test`); | |
| if (!posted) { | |
| console.log('Note: Maintainer will need to manually comment /ok-to-test'); | |
| } | |
| } | |
| // Do not fail the gate: fork PRs cannot run E2E on pull_request (no secrets). | |
| // Gate succeeds so the PR does not show a false failure; branch protection | |
| // should require "e2e-openshift" so merge stays blocked until /ok-to-test run passes. | |
| return; | |
| } | |
| // Non-fork PR from maintainer - run directly | |
| core.setOutput('should_run', 'true'); | |
| return; | |
| } | |
| // External contributor - post instructions and skip | |
| console.log('External contributor PR - posting instructions'); | |
| core.setOutput('should_run', 'false'); | |
| if (!botComment) { | |
| const posted = await tryPostComment(`👋 Thanks for your contribution!\n\nThis PR is from a fork, so **OpenShift E2E** (GPU) tests require approval to run.\n\n**For maintainers/admins:** Comment \`/ok-to-test\` to approve and trigger **OpenShift E2E** on this PR, or \`/retest\` to re-run OpenShift E2E (e.g. after a failure or new commits).\n\n**For contributors:** Please wait for a maintainer or admin to approve running the tests.`); | |
| if (!posted) { | |
| console.log('Note: Could not post instructions comment on fork PR'); | |
| } | |
| } | |
| // Do not fail the gate: GitHub does not allow updating status from upstream on fork | |
| // PRs, so a failed gate would stay red even after /ok-to-test run passes. Let the gate | |
| // succeed; branch protection should require "e2e-openshift" so merge stays blocked | |
| // until a maintainer comments /ok-to-test and E2E passes. | |
| - name: Write workflow summary | |
| if: always() | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| // These literals are expanded by Actions before the script runs (trusted step outputs / event name). | |
| const shouldRun = '${{ steps.check.outputs.should_run }}'; | |
| const isFork = '${{ steps.check.outputs.is_fork_pr }}'; | |
| const eventName = '${{ github.event_name }}'; | |
| if (shouldRun === 'true') { | |
| core.summary.addRaw('✅ **E2E tests will run** for this trigger.\n').write(); | |
| } else if (isFork === 'true' && eventName === 'pull_request') { | |
| core.summary.addRaw([ | |
| '⏸️ **E2E tests skipped — fork PR**\n\n', | |
| 'Fork PRs cannot run E2E on `pull_request` events (no access to secrets/GPU runners).\n\n', | |
| 'A maintainer must comment \`/ok-to-test\` to trigger the **OpenShift E2E** suite. ', | |
| 'Branch protection should require **e2e-openshift** so merge stays blocked until E2E passes.\n', | |
| ].join('')).write(); | |
| } else { | |
| core.summary.addRaw('⏸️ **E2E tests were skipped** (gate check did not pass for this trigger).\n').write(); | |
| } | |
| # Build the WVA controller image on GitHub-hosted runner (has proper Docker setup) | |
| # Note: Skip for fork PRs on pull_request event (no secrets access). | |
| # For fork PRs, build-image runs via issue_comment trigger (/ok-to-test). | |
| build-image: | |
| needs: gate | |
| if: | | |
| needs.gate.outputs.should_run == 'true' && | |
| (needs.gate.outputs.is_fork_pr != 'true' || github.event_name != 'pull_request') | |
| runs-on: ubuntu-latest | |
| outputs: | |
| image_tag: ${{ steps.build.outputs.image_tag }} | |
| steps: | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| # Use PR head SHA from gate (works for both pull_request and issue_comment) | |
| ref: ${{ needs.gate.outputs.pr_head_sha }} | |
| - name: Log in to GHCR | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| # NOTE(review): PAT-style secrets are used instead of github.token despite packages:write — confirm needed (e.g. cross-repo push). | |
| username: ${{ secrets.CR_USER }} | |
| password: ${{ secrets.CR_TOKEN }} | |
| - name: Build and push image | |
| id: build | |
| env: | |
| REGISTRY: ghcr.io | |
| IMAGE_NAME: ${{ github.repository }} | |
| # Use PR head SHA from gate | |
| GIT_REF: ${{ needs.gate.outputs.pr_head_sha }} | |
| run: | | |
| # Build image with git ref tag for this PR | |
| # Use first 8 chars of the git ref (POSIX-compliant) | |
| IMAGE_TAG="ref-$(printf '%s' "$GIT_REF" | cut -c1-8)" | |
| FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" | |
| echo "Building image: $FULL_IMAGE" | |
| echo "Git ref: $GIT_REF" | |
| # Build and push using make targets | |
| make docker-build IMG="$FULL_IMAGE" | |
| make docker-push IMG="$FULL_IMAGE" | |
| echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT | |
| echo "Image built and pushed: $FULL_IMAGE" | |
| # Run e2e tests on OpenShift self-hosted runner (vllm-d cluster). | |
| # pok-prod runners are reserved for nightly E2E only. | |
| e2e-openshift: | |
| runs-on: [self-hosted, openshift, vllm-d] | |
| needs: [gate, build-image] | |
| if: needs.gate.outputs.should_run == 'true' | |
| env: | |
| MODEL_ID: ${{ github.event.inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }} | |
| GOTOOLCHAIN: auto | |
| ACCELERATOR_TYPE: ${{ github.event.inputs.accelerator_type || 'A100' }} | |
| REQUEST_RATE: ${{ github.event.inputs.request_rate || '20' }} | |
| NUM_PROMPTS: ${{ github.event.inputs.num_prompts || '3000' }} | |
| MAX_NUM_SEQS: ${{ github.event.inputs.max_num_seqs || '5' }} | |
| HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '240' }} | |
| SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }} | |
| # Use main branch of llm-d/llm-d for inferencepool chart v1.2.1 (GA API support) | |
| LLM_D_RELEASE: main | |
| # PR-specific namespaces for isolation between concurrent PR tests | |
| # Primary llm-d namespace (Model A1 + A2) | |
| LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }} | |
| # Secondary llm-d namespace (Model B) | |
| LLMD_NAMESPACE_B: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}-b | |
| # WVA controller namespace (monitors all models) | |
| WVA_NAMESPACE: llm-d-autoscaler-pr-${{ needs.gate.outputs.pr_number || github.run_id }} | |
| # Unique release names per run to avoid conflicts | |
| WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }} | |
| # Model A1: Primary deployment in LLMD_NAMESPACE | |
| MODEL_A1_RELEASE: model-a1-${{ github.run_id }} | |
| # Model B: Deployment in LLMD_NAMESPACE_B | |
| MODEL_B_RELEASE: model-b-${{ github.run_id }} | |
| # Use the image built in the previous job | |
| WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }} | |
| steps: | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| # Use PR head SHA from gate (works for both pull_request and issue_comment) | |
| ref: ${{ needs.gate.outputs.pr_head_sha }} | |
| - name: Set up Go | |
| uses: actions/setup-go@v6 | |
| with: | |
| go-version: "1.25.x" | |
| cache-dependency-path: ./go.sum | |
| - name: Verify Go toolchain | |
| run: | | |
| which go | |
| go version | |
| go env GOTOOLCHAIN | |
| - name: Install tools (kubectl, oc, helm, make) | |
| run: | | |
| sudo apt-get update && sudo apt-get install -y make | |
| # Install kubectl - use pinned version for reproducible CI builds | |
| # Pinned 2025-12: v1.31.0 tested compatible with OpenShift 4.16+ | |
| # Update this version when upgrading target cluster or during regular dependency reviews | |
| KUBECTL_VERSION="v1.31.0" | |
| echo "Installing kubectl version: $KUBECTL_VERSION" | |
| curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" | |
| curl -fsSL --retry 3 --retry-delay 5 -o kubectl.sha256 "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256" | |
| echo "$(cat kubectl.sha256) kubectl" | sha256sum --check | |
| chmod +x kubectl | |
| sudo mv kubectl /usr/local/bin/ | |
| rm -f kubectl.sha256 | |
| # Install oc (OpenShift CLI) | |
| curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz" | |
| tar -xzf openshift-client-linux.tar.gz | |
| sudo mv oc /usr/local/bin/ | |
| rm -f openshift-client-linux.tar.gz kubectl README.md | |
| # Install helm | |
| curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash | |
| - name: Verify cluster access | |
| run: | | |
| echo "Verifying cluster access..." | |
| kubectl cluster-info | |
| kubectl get nodes | |
| - name: Verify correct cluster (vllm-d, not pok-prod) | |
| run: | | |
| # PR E2E tests must run on the vllm-d cluster, not pok-prod-sa. | |
| # pok-prod-sa is reserved for nightly E2E runs only. | |
| # Runners with the 'pok-prod' label connect to pok-prod-sa; | |
| # runners without it connect to vllm-d. | |
| CLUSTER_API=$(kubectl cluster-info 2>/dev/null | head -1 | grep -oE 'https://[^ ]+') | |
| echo "Cluster API: $CLUSTER_API" | |
| if echo "$CLUSTER_API" | grep -q "pokprod"; then | |
| echo "::error::This runner is connected to pok-prod-sa, but PR E2E tests must run on vllm-d." | |
| echo "::error::The runner likely has the 'pok-prod' label. PR CI should only use vllm-d runners." | |
| exit 1 | |
| fi | |
| echo "Cluster verified: running on vllm-d" | |
| - name: Check GPU availability | |
| id: gpu-check | |
| run: | | |
| echo "Checking GPU availability for e2e test..." | |
| # Minimum GPUs needed: 2 models × 2 GPUs each = 4 | |
| # Recommended with scale-up headroom: 6 | |
| REQUIRED_GPUS=4 | |
| RECOMMENDED_GPUS=6 | |
| # Total allocatable GPUs across all nodes | |
| TOTAL_GPUS=$(kubectl get nodes -o json | \ | |
| jq '[.items[].status.allocatable["nvidia.com/gpu"] // "0" | tonumber] | add // 0') | |
| # Currently requested GPUs by all pods | |
| ALLOCATED_GPUS=$(kubectl get pods --all-namespaces -o json | \ | |
| jq '[.items[] | select(.status.phase == "Running" or .status.phase == "Pending") | .spec.containers[]?.resources.requests["nvidia.com/gpu"] // "0" | tonumber] | add // 0') | |
| AVAILABLE_GPUS=$((TOTAL_GPUS - ALLOCATED_GPUS)) | |
| # Total allocatable CPU (cores) and memory (Gi) across all nodes | |
| # CPU may be in millicores (e.g. "8000m") or cores (e.g. "8") | |
| TOTAL_CPU=$(kubectl get nodes -o json | \ | |
| jq '[.items[].status.allocatable.cpu // "0" | if endswith("m") then (gsub("m$";"") | tonumber / 1000) else tonumber end] | add | floor') | |
| TOTAL_MEM_KI=$(kubectl get nodes -o json | \ | |
| jq '[.items[].status.allocatable.memory // "0" | gsub("[^0-9]";"") | tonumber] | add') | |
| TOTAL_MEM_GI=$((TOTAL_MEM_KI / 1048576)) | |
| NODE_COUNT=$(kubectl get nodes --no-headers | wc -l | tr -d ' ') | |
| GPU_NODE_COUNT=$(kubectl get nodes -o json | \ | |
| jq '[.items[] | select((.status.allocatable["nvidia.com/gpu"] // "0" | tonumber) > 0)] | length') | |
| # Export all values for the PR comment step | |
| echo "total_gpus=$TOTAL_GPUS" >> $GITHUB_OUTPUT | |
| echo "allocated_gpus=$ALLOCATED_GPUS" >> $GITHUB_OUTPUT | |
| echo "available_gpus=$AVAILABLE_GPUS" >> $GITHUB_OUTPUT | |
| echo "total_cpu=$TOTAL_CPU" >> $GITHUB_OUTPUT | |
| echo "total_mem_gi=$TOTAL_MEM_GI" >> $GITHUB_OUTPUT | |
| echo "node_count=$NODE_COUNT" >> $GITHUB_OUTPUT | |
| echo "gpu_node_count=$GPU_NODE_COUNT" >> $GITHUB_OUTPUT | |
| echo "required_gpus=$REQUIRED_GPUS" >> $GITHUB_OUTPUT | |
| echo "recommended_gpus=$RECOMMENDED_GPUS" >> $GITHUB_OUTPUT | |
| echo "## GPU Status" >> $GITHUB_STEP_SUMMARY | |
| echo "| Metric | Count |" >> $GITHUB_STEP_SUMMARY | |
| echo "|--------|-------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| Total cluster GPUs | $TOTAL_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Currently allocated | $ALLOCATED_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Available | $AVAILABLE_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Required (minimum) | $REQUIRED_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Recommended (with scale-up) | $RECOMMENDED_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| if [ "$AVAILABLE_GPUS" -lt "$REQUIRED_GPUS" ]; then | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "❌ **Insufficient GPUs** — need $REQUIRED_GPUS but only $AVAILABLE_GPUS available. Re-run when GPUs free up." >> $GITHUB_STEP_SUMMARY | |
| echo "::error::Insufficient GPUs: need $REQUIRED_GPUS, have $AVAILABLE_GPUS available. Try again later." | |
| echo "gpu_available=false" >> $GITHUB_OUTPUT | |
| exit 1 | |
| elif [ "$AVAILABLE_GPUS" -lt "$RECOMMENDED_GPUS" ]; then | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "⚠️ **Low GPU headroom** — $AVAILABLE_GPUS available (need $RECOMMENDED_GPUS for scale-up tests). Tests may fail during scale-up." >> $GITHUB_STEP_SUMMARY | |
| echo "::warning::Low GPU headroom: $AVAILABLE_GPUS available, $RECOMMENDED_GPUS recommended for scale-up tests" | |
| echo "gpu_available=true" >> $GITHUB_OUTPUT | |
| else | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "✅ **GPUs available** — $AVAILABLE_GPUS GPUs free ($REQUIRED_GPUS required, $RECOMMENDED_GPUS recommended)" >> $GITHUB_STEP_SUMMARY | |
| echo "gpu_available=true" >> $GITHUB_OUTPUT | |
| fi | |
| - name: Post GPU status to PR | |
| if: always() && needs.gate.outputs.pr_number != '' | |
| continue-on-error: true | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| PR_NUMBER: ${{ needs.gate.outputs.pr_number }} | |
| run: | | |
| GPU_STATUS="${{ steps.gpu-check.outcome }}" | |
| GPU_AVAIL="${{ steps.gpu-check.outputs.gpu_available }}" | |
| TOTAL_GPUS="${{ steps.gpu-check.outputs.total_gpus }}" | |
| ALLOCATED_GPUS="${{ steps.gpu-check.outputs.allocated_gpus }}" | |
| AVAILABLE_GPUS="${{ steps.gpu-check.outputs.available_gpus }}" | |
| TOTAL_CPU="${{ steps.gpu-check.outputs.total_cpu }}" | |
| TOTAL_MEM_GI="${{ steps.gpu-check.outputs.total_mem_gi }}" | |
| NODE_COUNT="${{ steps.gpu-check.outputs.node_count }}" | |
| GPU_NODE_COUNT="${{ steps.gpu-check.outputs.gpu_node_count }}" | |
| REQUIRED_GPUS="${{ steps.gpu-check.outputs.required_gpus }}" | |
| RECOMMENDED_GPUS="${{ steps.gpu-check.outputs.recommended_gpus }}" | |
| NL=$'\n' | |
| TABLE="| Resource | Total | Allocated | Available |${NL}|----------|-------|-----------|----------|${NL}| GPUs | $TOTAL_GPUS | $ALLOCATED_GPUS | **$AVAILABLE_GPUS** |${NL}${NL}| Cluster | Value |${NL}|---------|-------|${NL}| Nodes | $NODE_COUNT ($GPU_NODE_COUNT with GPUs) |${NL}| Total CPU | ${TOTAL_CPU} cores |${NL}| Total Memory | ${TOTAL_MEM_GI} Gi |${NL}| GPUs required | $REQUIRED_GPUS (min) / $RECOMMENDED_GPUS (recommended) |" | |
| if [ "$GPU_STATUS" = "failure" ]; then | |
| HEADER="### GPU Pre-flight Check ❌" | |
| MSG="**Insufficient GPUs** to run OpenShift E2E. Re-run with \`/retest\` (OpenShift E2E) when GPUs free up." | |
| elif [ "$GPU_AVAIL" = "true" ]; then | |
| HEADER="### GPU Pre-flight Check ✅" | |
| MSG="GPUs are available for e2e-openshift tests. Proceeding with deployment." | |
| else | |
| HEADER="### GPU Pre-flight Check ⚠️" | |
| MSG="Low GPU headroom — tests may fail during scale-up phases." | |
| fi | |
| BODY="${HEADER}${NL}${MSG}${NL}${NL}${TABLE}" | |
| PAYLOAD=$(jq -n --arg body "$BODY" '{"body": $body}') | |
| curl -s -X POST \ | |
| -H "Authorization: token $GH_TOKEN" \ | |
| -H "Accept: application/vnd.github.v3+json" \ | |
| "https://api.github.com/repos/${{ github.repository }}/issues/$PR_NUMBER/comments" \ | |
| -d "$PAYLOAD" | |
| - name: Get HF token from cluster secret | |
| id: hf-token | |
| run: | | |
| echo "Reading HF token from cluster secret llm-d-hf-token in default namespace..." | |
| # The llm-d-hf-token secret exists in the default namespace on the cluster | |
| # Check secret existence separately from key retrieval for better error messages | |
| if ! kubectl get secret llm-d-hf-token -n default &>/dev/null; then | |
| echo "::error::Secret 'llm-d-hf-token' not found in default namespace" | |
| echo "::error::Please ensure the HF token secret exists on the cluster" | |
| exit 1 | |
| fi | |
| # Read the token and mask it in logs | |
| HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d) | |
| if [ -z "$HF_TOKEN" ]; then | |
| echo "::error::Secret 'llm-d-hf-token' exists but 'HF_TOKEN' key is empty or missing" | |
| exit 1 | |
| fi | |
| # Mask the token in workflow logs | |
| echo "::add-mask::$HF_TOKEN" | |
| # Export for subsequent steps | |
| echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV | |
| echo "HF token retrieved successfully from cluster secret" | |
| - name: Clean up resources for this PR | |
| run: | | |
| echo "Cleaning up WVA resources for this PR's namespaces only..." | |
| echo " LLMD_NAMESPACE: $LLMD_NAMESPACE" | |
| echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B" | |
| echo " WVA_NAMESPACE: $WVA_NAMESPACE" | |
| # Only clean up the 3 namespaces associated with THIS PR | |
| # Do NOT touch namespaces from other PRs to avoid race conditions | |
| for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do | |
| if kubectl get namespace "$ns" &>/dev/null; then | |
| echo "" | |
| echo "=== Cleaning up namespace: $ns ===" | |
| # Delete WVA resources in this namespace | |
| echo " Removing HPAs and VAs..." | |
| kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true | |
| kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true | |
| # Uninstall all helm releases in the namespace | |
| for release in $(helm list -n "$ns" -q 2>/dev/null); do | |
| echo " Uninstalling helm release: $release" | |
| helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true | |
| done | |
| echo " Deleting namespace: $ns" | |
| kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true | |
| else | |
| echo "Namespace $ns does not exist, skipping cleanup" | |
| fi | |
| done | |
| # Clean up legacy namespaces if they exist (these are not PR-specific) | |
| for legacy_ns in llm-d-inference-scheduler workload-variant-autoscaler-system; do | |
| if kubectl get namespace "$legacy_ns" &>/dev/null; then | |
| echo "" | |
| echo "=== Cleaning up legacy namespace: $legacy_ns ===" | |
| # Uninstall all helm releases in the namespace first | |
| for release in $(helm list -n "$legacy_ns" -q 2>/dev/null); do | |
| echo " Uninstalling helm release: $release" | |
| helm uninstall "$release" -n "$legacy_ns" --ignore-not-found --wait --timeout 60s || true | |
| done | |
| echo " Deleting namespace: $legacy_ns" | |
| kubectl delete namespace "$legacy_ns" --ignore-not-found --timeout=60s || true | |
| fi | |
| done | |
| # The helmfile uses a generic release name "workload-variant-autoscaler" which | |
| # produces non-unique ClusterRole names. On shared clusters, these resources | |
| # may be owned by another namespace's release, causing Helm ownership conflicts. | |
| # Fix: adopt them for our namespace so helmfile can proceed. Post-cleanup will | |
| # delete them, and the next user's helmfile run will recreate them fresh. | |
| # Only adopt legacy helmfile-style names (release "workload-variant-autoscaler"). | |
| # PR-specific Helm releases use names like wva-e2e-<run_id>; those live in WVA_NAMESPACE. | |
| # Re-annotating them to LLMD_NAMESPACE breaks Helm ownership and can leave the controller | |
| # ServiceAccount bound to a wrong or unmanaged ClusterRole (cluster-wide list/watch denied). | |
| echo "Adopting shared WVA cluster-scoped resources for namespace $LLMD_NAMESPACE..." | |
| for kind in clusterrole clusterrolebinding; do | |
| kubectl get "$kind" -o json 2>/dev/null | \ | |
| jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | select(.metadata.name | startswith("wva-e2e-") | not) | select(.metadata.annotations["meta.helm.sh/release-namespace"] != null) | .metadata.name' 2>/dev/null | \ | |
| while read -r name; do | |
| current_ns=$(kubectl get "$kind" "$name" -o json 2>/dev/null | jq -r '.metadata.annotations["meta.helm.sh/release-namespace"] // ""') | |
| if [ "$current_ns" != "$LLMD_NAMESPACE" ]; then | |
| echo " Adopting $kind/$name (was owned by '$current_ns')" | |
| kubectl annotate "$kind" "$name" \ | |
| "meta.helm.sh/release-name=workload-variant-autoscaler" \ | |
| "meta.helm.sh/release-namespace=$LLMD_NAMESPACE" \ | |
| --overwrite || true | |
| fi | |
| done | |
| done | |
| echo "" | |
| echo "Cleanup complete for this PR's namespaces" | |
| - name: Apply latest CRDs | |
| run: | | |
| echo "Applying latest VariantAutoscaling CRD..." | |
| # Helm doesn't auto-update CRDs, so we need to apply them manually | |
| # to ensure the cluster has the latest schema (including scaleTargetRef) | |
| kubectl apply -f charts/workload-variant-autoscaler/crds/ | |
      # Primary (Model A1) deployment: installs the WVA controller plus the llm-d
      # serving stack into the PR-specific namespaces via deploy/install.sh.
      - name: Deploy WVA and llm-d infrastructure
        env:
          # HF_TOKEN is inherited from GITHUB_ENV (set in 'Get HF token from cluster secret' step)
          ENVIRONMENT: openshift
          INSTALL_GATEWAY_CTRLPLANE: "false"
          E2E_TESTS_ENABLED: "true"
          # OpenShift typically lacks HPAScaleToZero; e2e forces SCALE_TO_ZERO_ENABLED off for openshift
          # (see test/e2e/config.go). KEDA ScaledObjects support minReplicas=0 for scale-from-zero tests.
          SCALER_BACKEND: keda
          NAMESPACE_SCOPED: "false"
          # Pass PR-specific namespaces to install script
          LLMD_NS: ${{ env.LLMD_NAMESPACE }}
          WVA_NS: ${{ env.WVA_NAMESPACE }}
          # Controller instance label for multi-controller isolation in parallel e2e tests
          CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
          # Skip infra VA/HPA — the smoke test creates its own VA+HPA targeting
          # its own deployment. The infra VA adds a second idle pod to the
          # saturation analysis group, diluting KV cache metrics and preventing
          # scale-up from triggering.
          DEPLOY_VA: "false"
          DEPLOY_HPA: "false"
          # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
          VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
          # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
          DECODE_REPLICAS: "1"
          # OpenShift uses built-in user-workload monitoring, not a separate namespace
          MONITORING_NAMESPACE: openshift-user-workload-monitoring
          # Disable bearer token auth on WVA /metrics endpoint — OpenShift's
          # user-workload-monitoring cannot authenticate with the controller-manager
          # SA token. The endpoint is still only accessible within the cluster network.
          WVA_METRICS_SECURE: "false"
          # Lower saturation thresholds for simulator mode — the simulator's
          # KV-cache and queue metrics are modest, so default thresholds
          # (kvSpareTrigger=0.1, queueSpareTrigger=3) are too high to trigger
          # scale-up reliably. These values trigger when kvUsage > 0.30 or
          # queueLength > 0.5, which the simulator produces under load.
          KV_SPARE_TRIGGER: "0.5"
          QUEUE_SPARE_TRIGGER: "4.5"
          # inference-scheduling guide has routing proxy disabled, so vLLM
          # serves directly on port 8000 (not 8200 behind proxy)
          VLLM_SVC_PORT: "8000"
        run: |
          echo "Deploying WVA and llm-d infrastructure..."
          # Echo the effective configuration so failed runs are easy to diagnose
          # from the job log alone.
          echo "  MODEL_ID: $MODEL_ID"
          echo "  ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
          echo "  LLMD_NS: $LLMD_NS"
          echo "  WVA_NS: $WVA_NS"
          echo "  WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
          echo "  WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
          echo "  CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
          echo "  VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
          echo "  DECODE_REPLICAS: $DECODE_REPLICAS"
          echo "  KV_SPARE_TRIGGER: ${KV_SPARE_TRIGGER:-<default>}"
          echo "  QUEUE_SPARE_TRIGGER: ${QUEUE_SPARE_TRIGGER:-<default>}"
          echo "  HF token configuration: ✓"
          ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
      - name: Create secondary namespace for Model B
        run: |
          echo "Creating secondary namespace for Model B..."
          # --dry-run=client -o yaml | kubectl apply makes namespace creation
          # idempotent (a plain `kubectl create` fails if it already exists).
          kubectl create namespace "$LLMD_NAMESPACE_B" --dry-run=client -o yaml | kubectl apply -f -
          echo "Secondary namespace $LLMD_NAMESPACE_B created"
      - name: Label namespaces for OpenShift monitoring
        run: |
          echo "Adding openshift.io/user-monitoring label to namespaces for Prometheus scraping..."
          # --overwrite keeps relabeling idempotent across workflow re-runs.
          kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
          kubectl label namespace "$LLMD_NAMESPACE_B" openshift.io/user-monitoring=true --overwrite
          kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
          echo "Namespace labels applied"
| - name: Wait for infrastructure to be ready | |
| run: | | |
| echo "Waiting for WVA controller to be ready..." | |
| kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true | |
| kubectl get pods -n "$WVA_NAMESPACE" | |
| # Ensure the vLLM deployment has the correct replica count. | |
| # A previous failed run's "Scale down GPU workloads" step may have set replicas=0 | |
| # and helmfile doesn't override manually-changed replicas on re-deploy. | |
| # kubectl rollout status returns instantly on 0-replica deployments, so we must | |
| # ensure replicas > 0 before waiting. | |
| DESIRED_REPLICAS="${DECODE_REPLICAS:-1}" | |
| CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") | |
| if [ "$CURRENT_REPLICAS" -eq 0 ]; then | |
| echo "WARNING: Model A1 deployment has 0 replicas (likely from previous failed run cleanup)" | |
| echo "Scaling to $DESIRED_REPLICAS replica(s)..." | |
| kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --replicas="$DESIRED_REPLICAS" || { | |
| echo "ERROR: Failed to scale Model A1 deployment" | |
| exit 1 | |
| } | |
| fi | |
| echo "Waiting for Model A1 vLLM deployment to be ready (up to 25 minutes for model loading)..." | |
| # kubectl rollout status waits for all replicas to be Ready, unlike | |
| # --for=condition=available which is satisfied even at 0 ready replicas. | |
| # vLLM model loading takes 15-20 minutes, so we use a 25-minute timeout. | |
| kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || { | |
| echo "WARNING: Model A1 deployment not ready after 25 minutes" | |
| echo "=== Pod status ===" | |
| kubectl get pods -n "$LLMD_NAMESPACE" | |
| echo "=== Deployment conditions ===" | |
| kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.status.conditions}' | jq . || true | |
| echo "=== Recent events ===" | |
| kubectl get events -n "$LLMD_NAMESPACE" --sort-by='.lastTimestamp' | tail -20 | |
| } | |
| kubectl get pods -n "$LLMD_NAMESPACE" | |
      # Second llm-d serving stack (Model B) in its own namespace; reuses the
      # already-running WVA controller and Prometheus from the primary deploy.
      - name: Deploy Model B infrastructure in secondary namespace
        env:
          # HF_TOKEN is inherited from GITHUB_ENV
          ENVIRONMENT: openshift
          INSTALL_GATEWAY_CTRLPLANE: "false"
          E2E_TESTS_ENABLED: "true"
          SCALER_BACKEND: keda
          NAMESPACE_SCOPED: "false"
          # Override namespaces for Model B stack
          LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
          WVA_NS: ${{ env.WVA_NAMESPACE }}
          # Skip WVA controller and prometheus (use existing)
          DEPLOY_WVA: "false"
          DEPLOY_PROMETHEUS: "false"
          DEPLOY_PROMETHEUS_ADAPTER: "false"
          DEPLOY_VA: "false"
          DEPLOY_HPA: "false"
          # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
          VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
          # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
          DECODE_REPLICAS: "1"
          # OpenShift monitoring settings (same as Model A1 deploy)
          MONITORING_NAMESPACE: openshift-user-workload-monitoring
          WVA_METRICS_SECURE: "false"
          # Same port as Model A1 (inference-scheduling guide, proxy disabled)
          VLLM_SVC_PORT: "8000"
        run: |
          echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
          echo "  MODEL_ID: $MODEL_ID"
          echo "  ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
          echo "  VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
          echo "  DECODE_REPLICAS: $DECODE_REPLICAS"
          # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
          ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
          echo "Waiting for Model B deployment to start (initial rollout)..."
          # Wait briefly for deployments to be created by helm before checking rollout status
          sleep 10
          kubectl get pods -n "$LLMD_NAMESPACE_B"
      # Install only the autoscaling resources (VA, HPA, ServiceMonitor) for
      # Model B as a separate helm release, pointed at the shared controller.
      - name: Deploy Model B WVA resources
        env:
          LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
          WVA_NS: ${{ env.WVA_NAMESPACE }}
          # Use same controller instance as Model A for HPA selector matching
          CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
        run: |
          echo "Deploying Model B WVA resources..."
          echo "  Release name: $MODEL_B_RELEASE"
          echo "  CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
          # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model B
          # controller.enabled=false since we're using the existing WVA controller
          # Note: llmd.modelName should be base name without -decode suffix (template appends it)
          # `upgrade -i` (install-if-missing) keeps this step idempotent on re-runs.
          helm upgrade -i "$MODEL_B_RELEASE" ./charts/workload-variant-autoscaler \
            -n "$WVA_NAMESPACE" \
            --set controller.enabled=false \
            --set va.enabled=true \
            --set hpa.enabled=true \
            --set hpa.behavior.scaleUp.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
            --set hpa.behavior.scaleDown.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
            --set llmd.namespace="$LLMD_NAMESPACE_B" \
            --set llmd.modelName="ms-inference-scheduling-llm-d-modelservice" \
            --set llmd.modelID="$MODEL_ID" \
            --set va.accelerator="$ACCELERATOR_TYPE" \
            --set wva.baseName="inference-scheduling" \
            --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring \
            --set wva.metrics.secure=false \
            --set vllmService.port=8000 \
            --set vllmService.targetPort=8000 \
            --set wva.controllerInstance="$CONTROLLER_INSTANCE"
          echo "Model B WVA resources deployed"
          # Best-effort visibility of what the release created (|| true: listing
          # failures must not fail the deploy step).
          kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
          kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
| - name: Wait for Model B to be ready | |
| run: | | |
| # Same fix as Model A1: ensure replicas > 0 before waiting for rollout | |
| DESIRED_REPLICAS="${DECODE_REPLICAS:-1}" | |
| CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") | |
| if [ "$CURRENT_REPLICAS" -eq 0 ]; then | |
| echo "WARNING: Model B deployment has 0 replicas (likely from previous failed run cleanup)" | |
| echo "Scaling to $DESIRED_REPLICAS replica(s)..." | |
| kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --replicas="$DESIRED_REPLICAS" || { | |
| echo "ERROR: Failed to scale Model B deployment" | |
| exit 1 | |
| } | |
| fi | |
| echo "Waiting for Model B vLLM deployment to be ready (up to 25 minutes for model loading)..." | |
| # Same as Model A1: use rollout status to wait for actual pod readiness. | |
| kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --timeout=1500s || { | |
| echo "WARNING: Model B deployment not ready after 25 minutes" | |
| echo "=== Pod status ===" | |
| kubectl get pods -n "$LLMD_NAMESPACE_B" | |
| echo "=== Deployment conditions ===" | |
| kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.status.conditions}' | jq . || true | |
| echo "=== Recent events ===" | |
| kubectl get events -n "$LLMD_NAMESPACE_B" --sort-by='.lastTimestamp' | tail -20 | |
| } | |
      # Read-only status summary of both model stacks and the shared controller;
      # every command is best-effort (|| true) so this step never fails the job.
      - name: Verify multi-model deployment
        run: |
          echo "=== Multi-Model Deployment Status ==="
          echo ""
          echo "=== Model A1 (Primary, $LLMD_NAMESPACE) ==="
          # grep -E "decode|NAME" keeps the header row plus decode deployments only
          kubectl get deployment -n "$LLMD_NAMESPACE" | grep -E "decode|NAME" || true
          kubectl get hpa -n "$LLMD_NAMESPACE" || true
          kubectl get variantautoscaling -n "$LLMD_NAMESPACE" || true
          echo ""
          echo "=== Model B ($LLMD_NAMESPACE_B) ==="
          kubectl get deployment -n "$LLMD_NAMESPACE_B" | grep -E "decode|NAME" || true
          kubectl get hpa -n "$LLMD_NAMESPACE_B" || true
          kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" || true
          echo ""
          echo "=== WVA Controller ($WVA_NAMESPACE) ==="
          kubectl get pods -n "$WVA_NAMESPACE"
      # Pre-flight check of the whole metrics path (vLLM /metrics -> Service
      # endpoints -> Pod/ServiceMonitors -> WVA optimization status) so that
      # test failures can be attributed quickly. Diagnostic only: it logs
      # warnings but never fails the job.
      - name: Verify metrics pipeline
        run: |
          echo "=== Verifying metrics pipeline before running tests ==="
          echo ""
          # 1. Verify vLLM pods are serving /metrics endpoint
          echo "--- Step 1: Checking vLLM /metrics endpoint ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            VLLM_POD=$(kubectl get pods -n "$ns" -l llm-d.ai/inference-serving=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
            if [ -n "$VLLM_POD" ]; then
              # VLLM_SVC_PORT was only set as step-level env on the deploy steps,
              # so the :-8000 default applies here.
              PORT="${VLLM_SVC_PORT:-8000}"
              echo "  Checking vLLM pod $VLLM_POD in $ns (port $PORT)..."
              METRICS=$(kubectl exec -n "$ns" "$VLLM_POD" -- curl -s "http://localhost:${PORT}/metrics" 2>/dev/null | head -5 || true)
              if [ -n "$METRICS" ]; then
                echo "  ✅ vLLM metrics endpoint responding in $ns"
              else
                echo "  ⚠️ vLLM metrics endpoint not responding in $ns (may still be loading)"
              fi
              # Show pod labels for debugging
              kubectl get pod "$VLLM_POD" -n "$ns" -o jsonpath='{.metadata.labels}' | jq -r 'to_entries[] | "    \(.key)=\(.value)"' 2>/dev/null || true
            else
              echo "  ⚠️ No vLLM pods found with label llm-d.ai/inference-serving=true in $ns"
              echo "  All pods in $ns:"
              kubectl get pods -n "$ns" --show-labels 2>/dev/null || true
            fi
          done
          # 1b. Verify vllm-service has endpoints (critical for ServiceMonitor scraping)
          echo ""
          echo "--- Step 1b: Checking vllm-service endpoints ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            SVC_NAME=$(kubectl get svc -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
            if [ -n "$SVC_NAME" ]; then
              # NOTE(review): `kubectl get endpoints` reads the legacy Endpoints
              # API (deprecated in favor of EndpointSlice) — works today, but may
              # warn/break on future cluster versions.
              ENDPOINTS=$(kubectl get endpoints "$SVC_NAME" -n "$ns" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)
              if [ -n "$ENDPOINTS" ]; then
                echo "  ✅ Service $SVC_NAME in $ns has endpoints: $ENDPOINTS"
              else
                echo "  ❌ Service $SVC_NAME in $ns has NO endpoints — label selector mismatch!"
                echo "  Service selector:"
                kubectl get svc "$SVC_NAME" -n "$ns" -o jsonpath='{.spec.selector}' 2>/dev/null | jq . || true
              fi
            else
              echo "  ⚠️ No vllm-service found in $ns"
            fi
          done
          # 1c. Check PodMonitors (llm-d guide deploys these for direct pod scraping)
          echo ""
          echo "--- Step 1c: PodMonitor configuration ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            PM_COUNT=$(kubectl get podmonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
            echo "  PodMonitors in $ns: $PM_COUNT"
            kubectl get podmonitor -n "$ns" 2>/dev/null || true
          done
          # 2. Check WVA controller health
          echo ""
          echo "--- Step 2: WVA controller status ---"
          kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler
          WVA_POD=$(kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
          if [ -n "$WVA_POD" ]; then
            echo "  Recent WVA controller logs:"
            kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=20 | grep -E "reconcil|metrics|error|saturation" || echo "  (no matching log lines)"
          fi
          # 3. Check VariantAutoscaling status
          echo ""
          echo "--- Step 3: VariantAutoscaling status ---"
          kubectl get variantautoscaling -A -o wide 2>/dev/null || echo "  No VariantAutoscalings found"
          # 4. Check ServiceMonitors exist
          echo ""
          echo "--- Step 4: ServiceMonitor configuration ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do
            SM_COUNT=$(kubectl get servicemonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
            echo "  ServiceMonitors in $ns: $SM_COUNT"
            kubectl get servicemonitor -n "$ns" 2>/dev/null || true
          done
          # 5. Wait for WVA to start processing metrics (up to 3 minutes)
          echo ""
          echo "--- Step 5: Waiting for WVA to detect metrics (up to 3 minutes) ---"
          METRICS_READY=false
          # 18 attempts x 10s sleep = 3 minutes max wait.
          for i in $(seq 1 18); do
            # A populated desiredOptimizedAlloc.accelerator means the controller
            # has completed at least one optimization cycle.
            VA_STATUS=$(kubectl get variantautoscaling -n "$LLMD_NAMESPACE" -o jsonpath='{.items[0].status.desiredOptimizedAlloc.accelerator}' 2>/dev/null || true)
            if [ -n "$VA_STATUS" ]; then
              echo "  ✅ WVA optimization active — accelerator: $VA_STATUS"
              METRICS_READY=true
              break
            fi
            echo "  Attempt $i/18: WVA not yet optimizing, waiting 10s..."
            sleep 10
          done
          if [ "$METRICS_READY" = "false" ]; then
            echo "  ⚠️ WVA has not started optimizing after 3 minutes"
            echo "  This may cause test timeouts — dumping diagnostics:"
            echo ""
            echo "  === WVA controller logs (last 50 lines) ==="
            kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=50 2>/dev/null || true
            echo ""
            echo "  === HPA status ==="
            kubectl get hpa -A 2>/dev/null || true
            echo ""
            echo "  Continuing to tests anyway (they have their own timeouts)..."
          fi
          echo ""
          echo "=== Metrics pipeline verification complete ==="
      - name: Install Go dependencies
        run: |
          # GOTOOLCHAIN=auto lets Go fetch the toolchain version required by
          # go.mod when it is newer than the preinstalled one; the first two
          # commands just log which toolchain is in effect.
          GOTOOLCHAIN=auto go version
          GOTOOLCHAIN=auto go env GOTOOLCHAIN
          GOTOOLCHAIN=auto go mod download
      # The actual test run: all configuration flows to the Go e2e suite via
      # environment variables, echoed first for log-based debugging.
      - name: Run OpenShift E2E tests
        env:
          # Consolidated e2e test environment variables
          ENVIRONMENT: openshift
          USE_SIMULATOR: "true"
          # NOTE(review): test/e2e/config.go reportedly forces scale-to-zero off
          # for openshift (see earlier step comment) — confirm this "true" is
          # intentionally overridden there.
          SCALE_TO_ZERO_ENABLED: "true"
          WVA_NAMESPACE: ${{ env.WVA_NAMESPACE }}
          MONITORING_NAMESPACE: openshift-user-workload-monitoring
          LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
          # Legacy variables for backward compatibility (if needed by tests)
          CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
          # Multi-model testing: secondary namespace for Model B
          LLMD_NAMESPACE_B: ${{ env.LLMD_NAMESPACE_B }}
          GATEWAY_NAME: infra-inference-scheduling-inference-gateway-istio
          DEPLOYMENT: ms-inference-scheduling-llm-d-modelservice-decode
          # Pass WVA_RELEASE_NAME so test can filter for current run's resources
          WVA_RELEASE_NAME: ${{ env.WVA_RELEASE_NAME }}
          # Controller instance label must match what the controller was deployed with
          CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
          MODEL_ID: ${{ env.MODEL_ID }}
          REQUEST_RATE: ${{ env.REQUEST_RATE }}
          NUM_PROMPTS: ${{ env.NUM_PROMPTS }}
        run: |
          echo "Running consolidated E2E tests on OpenShift with configuration:"
          echo "  ENVIRONMENT: $ENVIRONMENT"
          echo "  USE_SIMULATOR: $USE_SIMULATOR"
          echo "  SCALE_TO_ZERO_ENABLED: $SCALE_TO_ZERO_ENABLED"
          echo "  WVA_NAMESPACE: $WVA_NAMESPACE"
          echo "  MONITORING_NAMESPACE: $MONITORING_NAMESPACE"
          echo "  LLMD_NAMESPACE: $LLMD_NAMESPACE"
          echo "  LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
          echo "  DEPLOYMENT: $DEPLOYMENT"
          echo "  GATEWAY_NAME: $GATEWAY_NAME"
          echo "  MODEL_ID: $MODEL_ID"
          echo "  REQUEST_RATE: $REQUEST_RATE"
          echo "  NUM_PROMPTS: $NUM_PROMPTS"
          echo "  WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
          make test-e2e-full
      - name: Cleanup infrastructure
        # Cleanup on success or cancellation, but NOT on failure (preserve for debugging)
        # Use SKIP_CLEANUP=true to keep resources after successful runs
        if: (success() || cancelled()) && env.SKIP_CLEANUP != 'true'
        run: |
          echo "Cleaning up ALL test infrastructure..."
          echo "  LLMD_NAMESPACE: $LLMD_NAMESPACE"
          echo "  LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
          echo "  WVA_NAMESPACE: $WVA_NAMESPACE"
          echo "  WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
          echo "  MODEL_B_RELEASE: $MODEL_B_RELEASE"
          # Uninstall all WVA helm releases before deleting namespaces
          # This ensures proper cleanup of resources and removes helm tracking
          # NOTE(review): `helm uninstall --ignore-not-found` requires Helm >= 3.13 —
          # confirm the runner's helm version.
          echo "Uninstalling WVA helm releases..."
          helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
          helm uninstall "$MODEL_B_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
          echo "Uninstalling llm-d helm releases in primary namespace..."
          for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
            echo "  Uninstalling release: $release"
            helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
          done
          echo "Uninstalling llm-d helm releases in secondary namespace..."
          for release in $(helm list -n "$LLMD_NAMESPACE_B" -q 2>/dev/null); do
            echo "  Uninstalling release: $release"
            helm uninstall "$release" -n "$LLMD_NAMESPACE_B" --ignore-not-found --wait --timeout 60s || true
          done
          # Delete all PR-specific namespaces
          echo "Deleting llm-d namespace $LLMD_NAMESPACE..."
          kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
          echo "Deleting llm-d namespace $LLMD_NAMESPACE_B..."
          kubectl delete namespace "$LLMD_NAMESPACE_B" --ignore-not-found --timeout=120s || true
          echo "Deleting WVA namespace $WVA_NAMESPACE..."
          kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
          # Clean up cluster-scoped WVA resources for THIS release only
          # Use both name and instance labels to avoid deleting resources from other PRs
          echo "Removing cluster-scoped WVA resources for release $WVA_RELEASE_NAME..."
          kubectl delete clusterrole,clusterrolebinding -l app.kubernetes.io/name=workload-variant-autoscaler,app.kubernetes.io/instance="$WVA_RELEASE_NAME" --ignore-not-found || true
          # Also clean up cluster-scoped resources owned by this PR's namespaces
          # (covers helmfile-created resources whose instance label differs from WVA_RELEASE_NAME)
          for kind in clusterrole clusterrolebinding; do
            # Emit "name<TAB>release-namespace" pairs so ownership can be checked
            # per resource; empty string when the helm annotation is absent.
            kubectl get "$kind" -o json 2>/dev/null | \
              jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | "\(.metadata.name)\t\(.metadata.annotations["meta.helm.sh/release-namespace"] // "")"' 2>/dev/null | \
              while IFS=$'\t' read -r name ns; do
                if [ "$ns" = "$LLMD_NAMESPACE" ] || [ "$ns" = "$LLMD_NAMESPACE_B" ] || [ "$ns" = "$WVA_NAMESPACE" ]; then
                  echo "  Deleting $kind/$name (owned by PR namespace '$ns')"
                  kubectl delete "$kind" "$name" --ignore-not-found || true
                fi
              done
          done
          echo "Cleanup complete"
      # Always-on diagnostics dump; every command is best-effort so this step
      # cannot change the job's outcome.
      - name: Dump cluster state
        if: always()
        run: |
          echo "=== Dumping cluster state for diagnostics ==="
          echo ""
          echo "=== VAs ==="
          # `va` is presumably the shortName for variantautoscaling (used in full
          # elsewhere in this workflow) — confirm against the CRD.
          kubectl get va -n "$LLMD_NAMESPACE" 2>/dev/null || true
          kubectl get va -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
          echo ""
          echo "=== HPAs ==="
          kubectl get hpa -n "$LLMD_NAMESPACE" 2>/dev/null || true
          kubectl get hpa -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
          echo ""
          echo "=== Controller pods ==="
          kubectl get pods -n "$WVA_NAMESPACE" 2>/dev/null || true
          echo ""
          echo "=== All resources ==="
          for ns in "$WVA_NAMESPACE" "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            # Skip namespaces that were already deleted (e.g. by cleanup).
            if kubectl get namespace "$ns" &>/dev/null; then
              echo "--- Namespace: $ns ---"
              kubectl get all -n "$ns" 2>/dev/null || true
              echo ""
              echo "--- Events in $ns ---"
              kubectl get events -n "$ns" --sort-by='.lastTimestamp' 2>/dev/null | tail -20 || true
              echo ""
            fi
          done
      - name: Scale down GPU workloads on failure
        # On failure, scale down decode deployments to free GPUs while preserving
        # other resources (VA, HPA, controller, gateway) for debugging
        if: failure()
        run: |
          echo "Test failed - scaling down decode deployments to free GPUs..."
          echo "Other resources (VA, HPA, controller logs) are preserved for debugging"
          echo ""
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            if kubectl get namespace "$ns" &>/dev/null; then
              echo "=== Scaling down decode deployments in $ns ==="
              # NOTE(review): this selector uses llm-d.ai/inferenceServing=true while
              # the metrics-pipeline step selects pods by llm-d.ai/inference-serving=true —
              # confirm which label the deployments actually carry. The name-pattern
              # fallback below covers a mismatch either way.
              kubectl scale deployment -n "$ns" -l llm-d.ai/inferenceServing=true --replicas=0 || true
              # Also try by name pattern in case labels are missing
              kubectl get deployment -n "$ns" -o name 2>/dev/null | grep decode | while read -r deploy; do
                echo "  Scaling down: $deploy"
                kubectl scale "$deploy" -n "$ns" --replicas=0 || true
              done
            fi
          done
  # Report status back to PR for issue_comment triggered runs
  # This ensures fork PRs show the correct status after /ok-to-test runs complete
  report-status:
    runs-on: ubuntu-latest
    needs: [gate, e2e-openshift]
    # Run always (even on failure) but only for issue_comment events
    if: always() && github.event_name == 'issue_comment' && needs.gate.outputs.should_run == 'true'
    steps:
      - name: Report status to PR
        uses: actions/github-script@v7
        with:
          script: |
            // Values are interpolated by Actions before the script runs, so they
            // appear here as string literals.
            const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
            const e2eResult = '${{ needs.e2e-openshift.result }}';
            // Map job result to commit status. The Commit Status API only accepts
            // error/failure/pending/success, so 'cancelled' must map onto one of
            // those (failure here); 'skipped' stays pending so the check isn't
            // shown as green without a real run.
            let state, description;
            if (e2eResult === 'success') {
              state = 'success';
              description = 'E2E tests passed';
            } else if (e2eResult === 'skipped') {
              state = 'pending';
              description = 'E2E tests skipped';
            } else if (e2eResult === 'cancelled') {
              state = 'failure';
              description = 'E2E tests cancelled';
            } else {
              state = 'failure';
              description = 'E2E tests failed';
            }
            console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
              repo: context.repo.repo,
              sha: prHeadSha,
              state: state,
              target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
              description: description,
              context: '${{ github.workflow }} / e2e (comment trigger)'
            });
            console.log('Status reported successfully');