# PR #2954: Add optimization loop performance metrics
name: CI - OpenShift E2E Tests

# Permissions needed for various jobs
permissions:
  contents: read
  packages: write
  pull-requests: write # For posting comments on PRs
  statuses: write # For reporting status on fork PR commits

# Cancel previous runs on the same PR to avoid resource conflicts
# Only group by PR number for legitimate triggers (pull_request, workflow_dispatch, /ok-to-test, or /retest comments)
# Regular comments get a unique group (run_id) so they don't cancel in-progress test runs
#
# Logic:
# - Regular comments (not /ok-to-test or /retest): unique group prevents cancellation of real tests
# - Valid triggers: group 'e2e-openshift-{pr_number}' (can cancel previous runs for same PR)
# - Fallback chain for ID: pull_request.number -> issue.number -> run_id
#
# NOTE: Valid command list (/ok-to-test, /retest) must stay in sync with gate job validation
concurrency:
  group: >-
    ${{
      github.event_name == 'issue_comment' &&
      !contains(github.event.comment.body, '/ok-to-test') &&
      !contains(github.event.comment.body, '/retest')
      && format('comment-isolated-{0}', github.run_id)
      || format('e2e-openshift-{0}',
        github.event.pull_request.number
        || github.event.issue.number
        || github.run_id)
    }}
  cancel-in-progress: true
on:
  pull_request:
    branches:
      - main
      - dev
  # Allow maintainers to trigger tests on fork PRs via /ok-to-test comment
  issue_comment:
    types: [created]
  workflow_dispatch:
    inputs:
      model_id:
        description: 'Model ID'
        required: false
        default: 'unsloth/Meta-Llama-3.1-8B'
      accelerator_type:
        description: 'Accelerator type (H100, A100, L40S)'
        required: false
        default: 'H100'
      request_rate:
        description: 'Request rate (req/s)'
        required: false
        default: '20'
      num_prompts:
        description: 'Number of prompts'
        required: false
        default: '3000'
      skip_cleanup:
        description: 'Skip cleanup after tests'
        required: false
        default: 'false'
      max_num_seqs:
        description: 'vLLM max batch size (lower = easier to saturate)'
        required: false
        default: '1'
      hpa_stabilization_seconds:
        description: 'HPA stabilization window in seconds'
        required: false
        default: '240'
jobs:
  # Check if PR contains code changes (not just docs/metadata)
  check-code-changes:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      has_code_changes: ${{ steps.set-output.outputs.has_code_changes }}
    steps:
      - name: Checkout source
        if: github.event_name == 'pull_request'
        uses: actions/checkout@v4
      - name: Check for code changes
        if: github.event_name == 'pull_request'
        uses: dorny/paths-filter@v3
        id: filter
        with:
          # Negated globs: 'code' matches any changed file outside docs/metadata
          filters: |
            code:
              - '!docs/**'
              - '!README.md'
              - '!CONTRIBUTING.md'
              - '!LICENSE'
              - '!OWNERS'
              - '!PROJECT'
      - name: Set output
        id: set-output
        run: |
          if [ "${{ github.event_name }}" != "pull_request" ]; then
            # Always run for issue_comment (/ok-to-test, /retest) and workflow_dispatch
            echo "has_code_changes=true" >> $GITHUB_OUTPUT
          elif [ -n "${{ steps.filter.outputs.code }}" ]; then
            echo "has_code_changes=${{ steps.filter.outputs.code }}" >> $GITHUB_OUTPUT
          else
            # Filter step skipped or produced no output — default to running tests
            echo "has_code_changes=true" >> $GITHUB_OUTPUT
          fi
# Gate: Check permissions and handle /ok-to-test for fork PRs.
# - Maintainers (write access): Tests run automatically on pull_request.
# - Fork PRs: Gate succeeds (no failure) so the PR does not show a false red check; E2E runs
# only after a maintainer comments /ok-to-test. Branch protection should require the
# "e2e-openshift" job so merge stays blocked until that run passes.
gate:
needs: check-code-changes
if: needs.check-code-changes.outputs.has_code_changes == 'true'
runs-on: ubuntu-latest
outputs:
should_run: ${{ steps.check.outputs.should_run }}
pr_number: ${{ steps.check.outputs.pr_number }}
pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
is_fork_pr: ${{ steps.check.outputs.is_fork_pr }}
steps:
- name: Check permissions and OpenShift E2E triggers (/ok-to-test, /retest)
id: check
uses: actions/github-script@v7
with:
script: |
// Helper to check if user has write access
async function hasWriteAccess(username) {
try {
const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: username
});
const privilegedRoles = ['admin', 'maintain', 'write'];
return privilegedRoles.includes(permission.permission);
} catch (e) {
console.log(`Could not get permissions for ${username}: ${e.message}`);
return false;
}
}
// Always run for workflow_dispatch
if (context.eventName === 'workflow_dispatch') {
core.setOutput('should_run', 'true');
core.setOutput('pr_number', '');
core.setOutput('pr_head_sha', context.sha);
core.setOutput('is_fork_pr', 'false');
return;
}
// Handle issue_comment event (/ok-to-test or /retest)
if (context.eventName === 'issue_comment') {
const comment = context.payload.comment.body.trim();
const issue = context.payload.issue;
// Only process /ok-to-test or /retest comments on PRs
if (!issue.pull_request) {
console.log('Comment is not on a PR, skipping');
core.setOutput('should_run', 'false');
return;
}
// NOTE: This list must stay in sync with concurrency group logic (lines 23-25)
const validCommands = ['/ok-to-test', '/retest'];
if (!validCommands.includes(comment)) {
console.log(`Comment "${comment}" is not a valid trigger command, skipping`);
core.setOutput('should_run', 'false');
return;
}
// Check if commenter has write access
const commenter = context.payload.comment.user.login;
const hasAccess = await hasWriteAccess(commenter);
if (!hasAccess) {
console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
core.setOutput('should_run', 'false');
return;
}
// Get PR details to get head SHA
const { data: pr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: issue.number
});
// Check if PR is from a fork
const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
const isFork = headRepo !== baseRepo;
console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
console.log(`PR head SHA: ${pr.head.sha}`);
console.log(`Is fork PR: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
core.setOutput('should_run', 'true');
core.setOutput('pr_number', issue.number.toString());
core.setOutput('pr_head_sha', pr.head.sha);
core.setOutput('is_fork_pr', isFork ? 'true' : 'false');
// Add reaction to acknowledge
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});
// Post comment with link to the e2e workflow run
const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
const cmdDesc = comment === '/ok-to-test' ? 'approve and run' : 're-run';
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issue.number,
body: `🚀 **OpenShift E2E** — ${cmdDesc} (\`${comment}\`)\n\n[View the OpenShift E2E workflow run](${runUrl})`
});
return;
}
// Handle pull_request event
const pr = context.payload.pull_request;
const prAuthor = pr.user.login;
const prNumber = pr.number;
const prHeadSha = pr.head.sha;
// Check if PR is from a fork
const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
const isFork = headRepo !== baseRepo;
console.log(`PR #${prNumber} is from fork: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
core.setOutput('pr_number', prNumber.toString());
core.setOutput('pr_head_sha', prHeadSha);
core.setOutput('is_fork_pr', isFork ? 'true' : 'false');
// Check if PR author has write access
const isPrivileged = await hasWriteAccess(prAuthor);
console.log(`PR #${prNumber} author ${prAuthor}: privileged=${isPrivileged}`);
// Check if we already posted a bot comment
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber
});
const botComment = comments.data.find(c =>
c.user.type === 'Bot' &&
c.body.includes('ok-to-test')
);
// Helper to safely post a comment (may fail on fork PRs due to permissions)
async function tryPostComment(body) {
try {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: body
});
return true;
} catch (e) {
// Fork PRs can't post comments on pull_request event (GitHub security restriction)
console.log(`Could not post comment (expected for fork PRs): ${e.message}`);
return false;
}
}
if (isPrivileged) {
// For maintainer/admin fork PRs, we need to trigger via /ok-to-test
// because fork PRs don't have access to secrets on pull_request event
if (isFork) {
console.log(`Maintainer fork PR detected - auto-triggering /ok-to-test for ${prAuthor}`);
core.setOutput('should_run', 'false'); // Don't run on pull_request event
// Auto-post /ok-to-test to trigger issue_comment workflow
if (!botComment) {
const posted = await tryPostComment(`/ok-to-test`);
if (!posted) {
console.log('Note: Maintainer will need to manually comment /ok-to-test');
}
}
// Do not fail the gate: fork PRs cannot run E2E on pull_request (no secrets).
// Gate succeeds so the PR does not show a false failure; branch protection
// should require "e2e-openshift" so merge stays blocked until /ok-to-test run passes.
return;
}
// Non-fork PR from maintainer - run directly
core.setOutput('should_run', 'true');
return;
}
// External contributor - post instructions and skip
console.log('External contributor PR - posting instructions');
core.setOutput('should_run', 'false');
if (!botComment) {
const posted = await tryPostComment(`👋 Thanks for your contribution!\n\nThis PR is from a fork, so **OpenShift E2E** (GPU) tests require approval to run.\n\n**For maintainers/admins:** Comment \`/ok-to-test\` to approve and trigger **OpenShift E2E** on this PR, or \`/retest\` to re-run OpenShift E2E (e.g. after a failure or new commits).\n\n**For contributors:** Please wait for a maintainer or admin to approve running the tests.`);
if (!posted) {
console.log('Note: Could not post instructions comment on fork PR');
}
}
// Do not fail the gate: GitHub does not allow updating status from upstream on fork
// PRs, so a failed gate would stay red even after /ok-to-test run passes. Let the gate
// succeed; branch protection should require "e2e-openshift" so merge stays blocked
// until a maintainer comments /ok-to-test and E2E passes.
- name: Write workflow summary
if: always()
uses: actions/github-script@v7
with:
script: |
const shouldRun = '${{ steps.check.outputs.should_run }}';
const isFork = '${{ steps.check.outputs.is_fork_pr }}';
const eventName = '${{ github.event_name }}';
if (shouldRun === 'true') {
core.summary.addRaw('✅ **E2E tests will run** for this trigger.\n').write();
} else if (isFork === 'true' && eventName === 'pull_request') {
core.summary.addRaw([
'⏸️ **E2E tests skipped — fork PR**\n\n',
'Fork PRs cannot run E2E on `pull_request` events (no access to secrets/GPU runners).\n\n',
'A maintainer must comment \`/ok-to-test\` to trigger the **OpenShift E2E** suite. ',
'Branch protection should require **e2e-openshift** so merge stays blocked until E2E passes.\n',
].join('')).write();
} else {
core.summary.addRaw('⏸️ **E2E tests were skipped** (gate check did not pass for this trigger).\n').write();
}
# Build the WVA controller image on GitHub-hosted runner (has proper Docker setup)
# Note: Skip for fork PRs on pull_request event (no secrets access).
# For fork PRs, build-image runs via issue_comment trigger (/ok-to-test).
build-image:
needs: gate
if: |
needs.gate.outputs.should_run == 'true' &&
(needs.gate.outputs.is_fork_pr != 'true' || github.event_name != 'pull_request')
runs-on: ubuntu-latest
outputs:
image_tag: ${{ steps.build.outputs.image_tag }}
steps:
- name: Checkout source
uses: actions/checkout@v4
with:
# Use PR head SHA from gate (works for both pull_request and issue_comment)
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Log in to GHCR
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ secrets.CR_USER }}
password: ${{ secrets.CR_TOKEN }}
- name: Build and push image
id: build
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
# Use PR head SHA from gate
GIT_REF: ${{ needs.gate.outputs.pr_head_sha }}
run: |
# Build image with git ref tag for this PR
# Use first 8 chars of the git ref (POSIX-compliant)
IMAGE_TAG="ref-$(printf '%s' "$GIT_REF" | cut -c1-8)"
FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
echo "Building image: $FULL_IMAGE"
echo "Git ref: $GIT_REF"
# Build and push using make targets
make docker-build IMG="$FULL_IMAGE"
make docker-push IMG="$FULL_IMAGE"
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "Image built and pushed: $FULL_IMAGE"
# Run e2e tests on OpenShift self-hosted runner (vllm-d cluster).
# pok-prod runners are reserved for nightly E2E only.
e2e-openshift:
runs-on: [self-hosted, openshift, vllm-d]
needs: [gate, build-image]
if: needs.gate.outputs.should_run == 'true'
env:
MODEL_ID: ${{ github.event.inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }}
GOTOOLCHAIN: auto
ACCELERATOR_TYPE: ${{ github.event.inputs.accelerator_type || 'A100' }}
REQUEST_RATE: ${{ github.event.inputs.request_rate || '20' }}
NUM_PROMPTS: ${{ github.event.inputs.num_prompts || '3000' }}
MAX_NUM_SEQS: ${{ github.event.inputs.max_num_seqs || '5' }}
HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '240' }}
SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
# Use main branch of llm-d/llm-d for inferencepool chart v1.2.1 (GA API support)
LLM_D_RELEASE: main
# PR-specific namespaces for isolation between concurrent PR tests
# Primary llm-d namespace (Model A1 + A2)
LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
# Secondary llm-d namespace (Model B)
LLMD_NAMESPACE_B: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}-b
# WVA controller namespace (monitors all models)
WVA_NAMESPACE: llm-d-autoscaler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
# Unique release names per run to avoid conflicts
WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }}
# Model A1: Primary deployment in LLMD_NAMESPACE
MODEL_A1_RELEASE: model-a1-${{ github.run_id }}
# Model B: Deployment in LLMD_NAMESPACE_B
MODEL_B_RELEASE: model-b-${{ github.run_id }}
# Use the image built in the previous job
WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
steps:
- name: Checkout source
uses: actions/checkout@v4
with:
# Use PR head SHA from gate (works for both pull_request and issue_comment)
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Set up Go
uses: actions/setup-go@v6
with:
go-version: "1.25.x"
cache-dependency-path: ./go.sum
- name: Verify Go toolchain
run: |
which go
go version
go env GOTOOLCHAIN
- name: Install tools (kubectl, oc, helm, make)
run: |
sudo apt-get update && sudo apt-get install -y make
# Install kubectl - use pinned version for reproducible CI builds
# Pinned 2025-12: v1.31.0 tested compatible with OpenShift 4.16+
# Update this version when upgrading target cluster or during regular dependency reviews
KUBECTL_VERSION="v1.31.0"
echo "Installing kubectl version: $KUBECTL_VERSION"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl.sha256 "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256"
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
chmod +x kubectl
sudo mv kubectl /usr/local/bin/
rm -f kubectl.sha256
# Install oc (OpenShift CLI)
curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz"
tar -xzf openshift-client-linux.tar.gz
sudo mv oc /usr/local/bin/
rm -f openshift-client-linux.tar.gz kubectl README.md
# Install helm
curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- name: Verify cluster access
run: |
echo "Verifying cluster access..."
kubectl cluster-info
kubectl get nodes
- name: Verify correct cluster (vllm-d, not pok-prod)
run: |
# PR E2E tests must run on the vllm-d cluster, not pok-prod-sa.
# pok-prod-sa is reserved for nightly E2E runs only.
# Runners with the 'pok-prod' label connect to pok-prod-sa;
# runners without it connect to vllm-d.
CLUSTER_API=$(kubectl cluster-info 2>/dev/null | head -1 | grep -oE 'https://[^ ]+')
echo "Cluster API: $CLUSTER_API"
if echo "$CLUSTER_API" | grep -q "pokprod"; then
echo "::error::This runner is connected to pok-prod-sa, but PR E2E tests must run on vllm-d."
echo "::error::The runner likely has the 'pok-prod' label. PR CI should only use vllm-d runners."
exit 1
fi
echo "Cluster verified: running on vllm-d"
- name: Check GPU availability
id: gpu-check
run: |
echo "Checking GPU availability for e2e test..."
# Minimum GPUs needed: 2 models × 2 GPUs each = 4
# Recommended with scale-up headroom: 6
REQUIRED_GPUS=4
RECOMMENDED_GPUS=6
# Total allocatable GPUs across all nodes
TOTAL_GPUS=$(kubectl get nodes -o json | \
jq '[.items[].status.allocatable["nvidia.com/gpu"] // "0" | tonumber] | add // 0')
# Currently requested GPUs by all pods
ALLOCATED_GPUS=$(kubectl get pods --all-namespaces -o json | \
jq '[.items[] | select(.status.phase == "Running" or .status.phase == "Pending") | .spec.containers[]?.resources.requests["nvidia.com/gpu"] // "0" | tonumber] | add // 0')
AVAILABLE_GPUS=$((TOTAL_GPUS - ALLOCATED_GPUS))
# Total allocatable CPU (cores) and memory (Gi) across all nodes
# CPU may be in millicores (e.g. "8000m") or cores (e.g. "8")
TOTAL_CPU=$(kubectl get nodes -o json | \
jq '[.items[].status.allocatable.cpu // "0" | if endswith("m") then (gsub("m$";"") | tonumber / 1000) else tonumber end] | add | floor')
TOTAL_MEM_KI=$(kubectl get nodes -o json | \
jq '[.items[].status.allocatable.memory // "0" | gsub("[^0-9]";"") | tonumber] | add')
TOTAL_MEM_GI=$((TOTAL_MEM_KI / 1048576))
NODE_COUNT=$(kubectl get nodes --no-headers | wc -l | tr -d ' ')
GPU_NODE_COUNT=$(kubectl get nodes -o json | \
jq '[.items[] | select((.status.allocatable["nvidia.com/gpu"] // "0" | tonumber) > 0)] | length')
# Export all values for the PR comment step
echo "total_gpus=$TOTAL_GPUS" >> $GITHUB_OUTPUT
echo "allocated_gpus=$ALLOCATED_GPUS" >> $GITHUB_OUTPUT
echo "available_gpus=$AVAILABLE_GPUS" >> $GITHUB_OUTPUT
echo "total_cpu=$TOTAL_CPU" >> $GITHUB_OUTPUT
echo "total_mem_gi=$TOTAL_MEM_GI" >> $GITHUB_OUTPUT
echo "node_count=$NODE_COUNT" >> $GITHUB_OUTPUT
echo "gpu_node_count=$GPU_NODE_COUNT" >> $GITHUB_OUTPUT
echo "required_gpus=$REQUIRED_GPUS" >> $GITHUB_OUTPUT
echo "recommended_gpus=$RECOMMENDED_GPUS" >> $GITHUB_OUTPUT
echo "## GPU Status" >> $GITHUB_STEP_SUMMARY
echo "| Metric | Count |" >> $GITHUB_STEP_SUMMARY
echo "|--------|-------|" >> $GITHUB_STEP_SUMMARY
echo "| Total cluster GPUs | $TOTAL_GPUS |" >> $GITHUB_STEP_SUMMARY
echo "| Currently allocated | $ALLOCATED_GPUS |" >> $GITHUB_STEP_SUMMARY
echo "| Available | $AVAILABLE_GPUS |" >> $GITHUB_STEP_SUMMARY
echo "| Required (minimum) | $REQUIRED_GPUS |" >> $GITHUB_STEP_SUMMARY
echo "| Recommended (with scale-up) | $RECOMMENDED_GPUS |" >> $GITHUB_STEP_SUMMARY
if [ "$AVAILABLE_GPUS" -lt "$REQUIRED_GPUS" ]; then
echo "" >> $GITHUB_STEP_SUMMARY
echo "❌ **Insufficient GPUs** — need $REQUIRED_GPUS but only $AVAILABLE_GPUS available. Re-run when GPUs free up." >> $GITHUB_STEP_SUMMARY
echo "::error::Insufficient GPUs: need $REQUIRED_GPUS, have $AVAILABLE_GPUS available. Try again later."
echo "gpu_available=false" >> $GITHUB_OUTPUT
exit 1
elif [ "$AVAILABLE_GPUS" -lt "$RECOMMENDED_GPUS" ]; then
echo "" >> $GITHUB_STEP_SUMMARY
echo "⚠️ **Low GPU headroom** — $AVAILABLE_GPUS available (need $RECOMMENDED_GPUS for scale-up tests). Tests may fail during scale-up." >> $GITHUB_STEP_SUMMARY
echo "::warning::Low GPU headroom: $AVAILABLE_GPUS available, $RECOMMENDED_GPUS recommended for scale-up tests"
echo "gpu_available=true" >> $GITHUB_OUTPUT
else
echo "" >> $GITHUB_STEP_SUMMARY
echo "✅ **GPUs available** — $AVAILABLE_GPUS GPUs free ($REQUIRED_GPUS required, $RECOMMENDED_GPUS recommended)" >> $GITHUB_STEP_SUMMARY
echo "gpu_available=true" >> $GITHUB_OUTPUT
fi
- name: Post GPU status to PR
if: always() && needs.gate.outputs.pr_number != ''
continue-on-error: true
env:
GH_TOKEN: ${{ github.token }}
PR_NUMBER: ${{ needs.gate.outputs.pr_number }}
run: |
GPU_STATUS="${{ steps.gpu-check.outcome }}"
GPU_AVAIL="${{ steps.gpu-check.outputs.gpu_available }}"
TOTAL_GPUS="${{ steps.gpu-check.outputs.total_gpus }}"
ALLOCATED_GPUS="${{ steps.gpu-check.outputs.allocated_gpus }}"
AVAILABLE_GPUS="${{ steps.gpu-check.outputs.available_gpus }}"
TOTAL_CPU="${{ steps.gpu-check.outputs.total_cpu }}"
TOTAL_MEM_GI="${{ steps.gpu-check.outputs.total_mem_gi }}"
NODE_COUNT="${{ steps.gpu-check.outputs.node_count }}"
GPU_NODE_COUNT="${{ steps.gpu-check.outputs.gpu_node_count }}"
REQUIRED_GPUS="${{ steps.gpu-check.outputs.required_gpus }}"
RECOMMENDED_GPUS="${{ steps.gpu-check.outputs.recommended_gpus }}"
NL=$'\n'
TABLE="| Resource | Total | Allocated | Available |${NL}|----------|-------|-----------|----------|${NL}| GPUs | $TOTAL_GPUS | $ALLOCATED_GPUS | **$AVAILABLE_GPUS** |${NL}${NL}| Cluster | Value |${NL}|---------|-------|${NL}| Nodes | $NODE_COUNT ($GPU_NODE_COUNT with GPUs) |${NL}| Total CPU | ${TOTAL_CPU} cores |${NL}| Total Memory | ${TOTAL_MEM_GI} Gi |${NL}| GPUs required | $REQUIRED_GPUS (min) / $RECOMMENDED_GPUS (recommended) |"
if [ "$GPU_STATUS" = "failure" ]; then
HEADER="### GPU Pre-flight Check ❌"
MSG="**Insufficient GPUs** to run OpenShift E2E. Re-run with \`/retest\` (OpenShift E2E) when GPUs free up."
elif [ "$GPU_AVAIL" = "true" ]; then
HEADER="### GPU Pre-flight Check ✅"
MSG="GPUs are available for e2e-openshift tests. Proceeding with deployment."
else
HEADER="### GPU Pre-flight Check ⚠️"
MSG="Low GPU headroom — tests may fail during scale-up phases."
fi
BODY="${HEADER}${NL}${MSG}${NL}${NL}${TABLE}"
PAYLOAD=$(jq -n --arg body "$BODY" '{"body": $body}')
curl -s -X POST \
-H "Authorization: token $GH_TOKEN" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/issues/$PR_NUMBER/comments" \
-d "$PAYLOAD"
- name: Get HF token from cluster secret
id: hf-token
run: |
echo "Reading HF token from cluster secret llm-d-hf-token in default namespace..."
# The llm-d-hf-token secret exists in the default namespace on the cluster
# Check secret existence separately from key retrieval for better error messages
if ! kubectl get secret llm-d-hf-token -n default &>/dev/null; then
echo "::error::Secret 'llm-d-hf-token' not found in default namespace"
echo "::error::Please ensure the HF token secret exists on the cluster"
exit 1
fi
# Read the token and mask it in logs
HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d)
if [ -z "$HF_TOKEN" ]; then
echo "::error::Secret 'llm-d-hf-token' exists but 'HF_TOKEN' key is empty or missing"
exit 1
fi
# Mask the token in workflow logs
echo "::add-mask::$HF_TOKEN"
# Export for subsequent steps
echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV
echo "HF token retrieved successfully from cluster secret"
- name: Clean up resources for this PR
run: |
echo "Cleaning up WVA resources for this PR's namespaces only..."
echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
echo " WVA_NAMESPACE: $WVA_NAMESPACE"
# Only clean up the 3 namespaces associated with THIS PR
# Do NOT touch namespaces from other PRs to avoid race conditions
for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do
if kubectl get namespace "$ns" &>/dev/null; then
echo ""
echo "=== Cleaning up namespace: $ns ==="
# Delete WVA resources in this namespace
echo " Removing HPAs and VAs..."
kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
# Uninstall all helm releases in the namespace
for release in $(helm list -n "$ns" -q 2>/dev/null); do
echo " Uninstalling helm release: $release"
helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true
done
echo " Deleting namespace: $ns"
kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true
else
echo "Namespace $ns does not exist, skipping cleanup"
fi
done
# Clean up legacy namespaces if they exist (these are not PR-specific)
for legacy_ns in llm-d-inference-scheduler workload-variant-autoscaler-system; do
if kubectl get namespace "$legacy_ns" &>/dev/null; then
echo ""
echo "=== Cleaning up legacy namespace: $legacy_ns ==="
# Uninstall all helm releases in the namespace first
for release in $(helm list -n "$legacy_ns" -q 2>/dev/null); do
echo " Uninstalling helm release: $release"
helm uninstall "$release" -n "$legacy_ns" --ignore-not-found --wait --timeout 60s || true
done
echo " Deleting namespace: $legacy_ns"
kubectl delete namespace "$legacy_ns" --ignore-not-found --timeout=60s || true
fi
done
# The helmfile uses a generic release name "workload-variant-autoscaler" which
# produces non-unique ClusterRole names. On shared clusters, these resources
# may be owned by another namespace's release, causing Helm ownership conflicts.
# Fix: adopt them for our namespace so helmfile can proceed. Post-cleanup will
# delete them, and the next user's helmfile run will recreate them fresh.
# Only adopt legacy helmfile-style names (release "workload-variant-autoscaler").
# PR-specific Helm releases use names like wva-e2e-<run_id>; those live in WVA_NAMESPACE.
# Re-annotating them to LLMD_NAMESPACE breaks Helm ownership and can leave the controller
# ServiceAccount bound to a wrong or unmanaged ClusterRole (cluster-wide list/watch denied).
echo "Adopting shared WVA cluster-scoped resources for namespace $LLMD_NAMESPACE..."
for kind in clusterrole clusterrolebinding; do
kubectl get "$kind" -o json 2>/dev/null | \
jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | select(.metadata.name | startswith("wva-e2e-") | not) | select(.metadata.annotations["meta.helm.sh/release-namespace"] != null) | .metadata.name' 2>/dev/null | \
while read -r name; do
current_ns=$(kubectl get "$kind" "$name" -o json 2>/dev/null | jq -r '.metadata.annotations["meta.helm.sh/release-namespace"] // ""')
if [ "$current_ns" != "$LLMD_NAMESPACE" ]; then
echo " Adopting $kind/$name (was owned by '$current_ns')"
kubectl annotate "$kind" "$name" \
"meta.helm.sh/release-name=workload-variant-autoscaler" \
"meta.helm.sh/release-namespace=$LLMD_NAMESPACE" \
--overwrite || true
fi
done
done
echo ""
echo "Cleanup complete for this PR's namespaces"
- name: Apply latest CRDs
run: |
echo "Applying latest VariantAutoscaling CRD..."
# Helm doesn't auto-update CRDs, so we need to apply them manually
# to ensure the cluster has the latest schema (including scaleTargetRef)
kubectl apply -f charts/workload-variant-autoscaler/crds/
- name: Deploy WVA and llm-d infrastructure
env:
# HF_TOKEN is inherited from GITHUB_ENV (set in 'Get HF token from cluster secret' step)
ENVIRONMENT: openshift
INSTALL_GATEWAY_CTRLPLANE: "false"
E2E_TESTS_ENABLED: "true"
# OpenShift typically lacks HPAScaleToZero; e2e forces SCALE_TO_ZERO_ENABLED off for openshift
# (see test/e2e/config.go). KEDA ScaledObjects support minReplicas=0 for scale-from-zero tests.
SCALER_BACKEND: keda
NAMESPACE_SCOPED: "false"
# Pass PR-specific namespaces to install script
LLMD_NS: ${{ env.LLMD_NAMESPACE }}
WVA_NS: ${{ env.WVA_NAMESPACE }}
# Controller instance label for multi-controller isolation in parallel e2e tests
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
# Skip infra VA/HPA — the smoke test creates its own VA+HPA targeting
# its own deployment. The infra VA adds a second idle pod to the
# saturation analysis group, diluting KV cache metrics and preventing
# scale-up from triggering.
DEPLOY_VA: "false"
DEPLOY_HPA: "false"
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
DECODE_REPLICAS: "1"
# OpenShift uses built-in user-workload monitoring, not a separate namespace
MONITORING_NAMESPACE: openshift-user-workload-monitoring
# Disable bearer token auth on WVA /metrics endpoint — OpenShift's
# user-workload-monitoring cannot authenticate with the controller-manager
# SA token. The endpoint is still only accessible within the cluster network.
WVA_METRICS_SECURE: "false"
# Lower saturation thresholds for simulator mode — the simulator's
# KV-cache and queue metrics are modest, so default thresholds
# (kvSpareTrigger=0.1, queueSpareTrigger=3) are too high to trigger
# scale-up reliably. These values trigger when kvUsage > 0.30 or
# queueLength > 0.5, which the simulator produces under load.
KV_SPARE_TRIGGER: "0.5"
QUEUE_SPARE_TRIGGER: "4.5"
# inference-scheduling guide has routing proxy disabled, so vLLM
# serves directly on port 8000 (not 8200 behind proxy)
VLLM_SVC_PORT: "8000"
run: |
echo "Deploying WVA and llm-d infrastructure..."
echo " MODEL_ID: $MODEL_ID"
echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
echo " LLMD_NS: $LLMD_NS"
echo " WVA_NS: $WVA_NS"
echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
echo " WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
echo " KV_SPARE_TRIGGER: ${KV_SPARE_TRIGGER:-<default>}"
echo " QUEUE_SPARE_TRIGGER: ${QUEUE_SPARE_TRIGGER:-<default>}"
echo " HF token configuration: ✓"
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
- name: Create secondary namespace for Model B
run: |
echo "Creating secondary namespace for Model B..."
kubectl create namespace "$LLMD_NAMESPACE_B" --dry-run=client -o yaml | kubectl apply -f -
echo "Secondary namespace $LLMD_NAMESPACE_B created"
- name: Label namespaces for OpenShift monitoring
run: |
echo "Adding openshift.io/user-monitoring label to namespaces for Prometheus scraping..."
kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
kubectl label namespace "$LLMD_NAMESPACE_B" openshift.io/user-monitoring=true --overwrite
kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
echo "Namespace labels applied"
# Waits for the WVA controller and the Model A1 vLLM decode deployment,
# repairing a 0-replica state left behind by a previous failed run.
- name: Wait for infrastructure to be ready
  run: |
    echo "Waiting for WVA controller to be ready..."
    # Best-effort (|| true): the pod listing below shows state either way.
    kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true
    kubectl get pods -n "$WVA_NAMESPACE"
    # Ensure the vLLM deployment has the correct replica count.
    # A previous failed run's "Scale down GPU workloads" step may have set replicas=0
    # and helmfile doesn't override manually-changed replicas on re-deploy.
    # kubectl rollout status returns instantly on 0-replica deployments, so we must
    # ensure replicas > 0 before waiting.
    DESIRED_REPLICAS="${DECODE_REPLICAS:-1}"
    # "echo 0" fallback covers the deployment-not-found case so -eq below is safe.
    CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0")
    if [ "$CURRENT_REPLICAS" -eq 0 ]; then
      echo "WARNING: Model A1 deployment has 0 replicas (likely from previous failed run cleanup)"
      echo "Scaling to $DESIRED_REPLICAS replica(s)..."
      kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --replicas="$DESIRED_REPLICAS" || {
        echo "ERROR: Failed to scale Model A1 deployment"
        exit 1
      }
    fi
    echo "Waiting for Model A1 vLLM deployment to be ready (up to 25 minutes for model loading)..."
    # kubectl rollout status waits for all replicas to be Ready, unlike
    # --for=condition=available which is satisfied even at 0 ready replicas.
    # vLLM model loading takes 15-20 minutes, so we use a 25-minute timeout.
    # NOTE: this || branch only prints diagnostics; the step still succeeds
    # on timeout so later steps get a chance to run (and time out themselves).
    kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || {
      echo "WARNING: Model A1 deployment not ready after 25 minutes"
      echo "=== Pod status ==="
      kubectl get pods -n "$LLMD_NAMESPACE"
      echo "=== Deployment conditions ==="
      kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.status.conditions}' | jq . || true
      echo "=== Recent events ==="
      kubectl get events -n "$LLMD_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
    }
    kubectl get pods -n "$LLMD_NAMESPACE"
# Deploys a second llm-d stack (Model B) into the secondary namespace.
# Shared components (WVA controller, Prometheus, adapter) are skipped and
# reused from the primary deployment via DEPLOY_*="false" switches.
- name: Deploy Model B infrastructure in secondary namespace
  env:
    # HF_TOKEN is inherited from GITHUB_ENV
    ENVIRONMENT: openshift
    INSTALL_GATEWAY_CTRLPLANE: "false"
    E2E_TESTS_ENABLED: "true"
    SCALER_BACKEND: keda
    NAMESPACE_SCOPED: "false"
    # Override namespaces for Model B stack
    LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
    WVA_NS: ${{ env.WVA_NAMESPACE }}
    # Skip WVA controller and prometheus (use existing)
    DEPLOY_WVA: "false"
    DEPLOY_PROMETHEUS: "false"
    DEPLOY_PROMETHEUS_ADAPTER: "false"
    DEPLOY_VA: "false"
    DEPLOY_HPA: "false"
    # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
    VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
    # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
    DECODE_REPLICAS: "1"
    # OpenShift monitoring settings (same as Model A1 deploy)
    MONITORING_NAMESPACE: openshift-user-workload-monitoring
    WVA_METRICS_SECURE: "false"
    # Same port as Model A1 (inference-scheduling guide, proxy disabled)
    VLLM_SVC_PORT: "8000"
  run: |
    echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
    echo " MODEL_ID: $MODEL_ID"
    echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
    echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
    echo " DECODE_REPLICAS: $DECODE_REPLICAS"
    # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
    ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
    echo "Waiting for Model B deployment to start (initial rollout)..."
    # Wait briefly for deployments to be created by helm before checking rollout status
    sleep 10
    kubectl get pods -n "$LLMD_NAMESPACE_B"
# Installs the WVA chart for Model B with controller.enabled=false: only the
# VariantAutoscaling, HPA, and ServiceMonitor resources are created, wired to
# the already-running controller via the shared controllerInstance label.
- name: Deploy Model B WVA resources
  env:
    LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
    WVA_NS: ${{ env.WVA_NAMESPACE }}
    # Use same controller instance as Model A for HPA selector matching
    CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
  run: |
    echo "Deploying Model B WVA resources..."
    echo " Release name: $MODEL_B_RELEASE"
    echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
    # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model B
    # controller.enabled=false since we're using the existing WVA controller
    # Note: llmd.modelName should be base name without -decode suffix (template appends it)
    helm upgrade -i "$MODEL_B_RELEASE" ./charts/workload-variant-autoscaler \
      -n "$WVA_NAMESPACE" \
      --set controller.enabled=false \
      --set va.enabled=true \
      --set hpa.enabled=true \
      --set hpa.behavior.scaleUp.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
      --set hpa.behavior.scaleDown.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
      --set llmd.namespace="$LLMD_NAMESPACE_B" \
      --set llmd.modelName="ms-inference-scheduling-llm-d-modelservice" \
      --set llmd.modelID="$MODEL_ID" \
      --set va.accelerator="$ACCELERATOR_TYPE" \
      --set wva.baseName="inference-scheduling" \
      --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring \
      --set wva.metrics.secure=false \
      --set vllmService.port=8000 \
      --set vllmService.targetPort=8000 \
      --set wva.controllerInstance="$CONTROLLER_INSTANCE"
    echo "Model B WVA resources deployed"
    # Listing is best-effort (|| true): resources may take a moment to appear.
    kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
    kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
# Mirrors the Model A1 readiness wait: repair a 0-replica deployment left by
# a previous failed run, then wait for the rollout with a long model-load timeout.
- name: Wait for Model B to be ready
  run: |
    # Same fix as Model A1: ensure replicas > 0 before waiting for rollout
    DESIRED_REPLICAS="${DECODE_REPLICAS:-1}"
    CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0")
    if [ "$CURRENT_REPLICAS" -eq 0 ]; then
      echo "WARNING: Model B deployment has 0 replicas (likely from previous failed run cleanup)"
      echo "Scaling to $DESIRED_REPLICAS replica(s)..."
      kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --replicas="$DESIRED_REPLICAS" || {
        echo "ERROR: Failed to scale Model B deployment"
        exit 1
      }
    fi
    echo "Waiting for Model B vLLM deployment to be ready (up to 25 minutes for model loading)..."
    # Same as Model A1: use rollout status to wait for actual pod readiness.
    # On timeout this branch only dumps diagnostics; the step still succeeds.
    kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --timeout=1500s || {
      echo "WARNING: Model B deployment not ready after 25 minutes"
      echo "=== Pod status ==="
      kubectl get pods -n "$LLMD_NAMESPACE_B"
      echo "=== Deployment conditions ==="
      kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.status.conditions}' | jq . || true
      echo "=== Recent events ==="
      kubectl get events -n "$LLMD_NAMESPACE_B" --sort-by='.lastTimestamp' | tail -20
    }
# Purely informational snapshot of both model stacks and the shared
# controller; every command is best-effort so this step never fails the job.
- name: Verify multi-model deployment
  run: |
    echo "=== Multi-Model Deployment Status ==="
    echo ""
    echo "=== Model A1 (Primary, $LLMD_NAMESPACE) ==="
    # grep keeps the header row (NAME) plus decode deployments only.
    kubectl get deployment -n "$LLMD_NAMESPACE" | grep -E "decode|NAME" || true
    kubectl get hpa -n "$LLMD_NAMESPACE" || true
    kubectl get variantautoscaling -n "$LLMD_NAMESPACE" || true
    echo ""
    echo "=== Model B ($LLMD_NAMESPACE_B) ==="
    kubectl get deployment -n "$LLMD_NAMESPACE_B" | grep -E "decode|NAME" || true
    kubectl get hpa -n "$LLMD_NAMESPACE_B" || true
    kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" || true
    echo ""
    echo "=== WVA Controller ($WVA_NAMESPACE) ==="
    kubectl get pods -n "$WVA_NAMESPACE"
# Pre-flight check of the whole metrics path (vLLM /metrics -> Service ->
# ServiceMonitor/PodMonitor -> Prometheus -> WVA optimization) so test
# failures can be attributed to the pipeline rather than the tests.
# Fix: the pod lookup previously used only the label
# llm-d.ai/inference-serving=true, while the "Scale down GPU workloads" step
# selects llm-d.ai/inferenceServing=true — a casing mismatch that could turn
# this check into a false negative. We now fall back to the camelCase label.
- name: Verify metrics pipeline
  run: |
    echo "=== Verifying metrics pipeline before running tests ==="
    echo ""
    # 1. Verify vLLM pods are serving /metrics endpoint
    echo "--- Step 1: Checking vLLM /metrics endpoint ---"
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
      VLLM_POD=$(kubectl get pods -n "$ns" -l llm-d.ai/inference-serving=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
      if [ -z "$VLLM_POD" ]; then
        # Fallback: the scale-down step uses the camelCase label variant
        # (llm-d.ai/inferenceServing=true); try it here too in case the
        # deployed pods carry that form instead.
        VLLM_POD=$(kubectl get pods -n "$ns" -l llm-d.ai/inferenceServing=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
      fi
      if [ -n "$VLLM_POD" ]; then
        PORT="${VLLM_SVC_PORT:-8000}"
        echo " Checking vLLM pod $VLLM_POD in $ns (port $PORT)..."
        METRICS=$(kubectl exec -n "$ns" "$VLLM_POD" -- curl -s "http://localhost:${PORT}/metrics" 2>/dev/null | head -5 || true)
        if [ -n "$METRICS" ]; then
          echo " ✅ vLLM metrics endpoint responding in $ns"
        else
          echo " ⚠️ vLLM metrics endpoint not responding in $ns (may still be loading)"
        fi
        # Show pod labels for debugging
        echo " Pod labels:"
        kubectl get pod "$VLLM_POD" -n "$ns" -o jsonpath='{.metadata.labels}' | jq -r 'to_entries[] | " \(.key)=\(.value)"' 2>/dev/null || true
      else
        echo " ⚠️ No vLLM pods found with label llm-d.ai/inference-serving=true (or llm-d.ai/inferenceServing=true) in $ns"
        echo " All pods in $ns:"
        kubectl get pods -n "$ns" --show-labels 2>/dev/null || true
      fi
    done
    # 1b. Verify vllm-service has endpoints (critical for ServiceMonitor scraping)
    echo ""
    echo "--- Step 1b: Checking vllm-service endpoints ---"
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
      SVC_NAME=$(kubectl get svc -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
      if [ -n "$SVC_NAME" ]; then
        ENDPOINTS=$(kubectl get endpoints "$SVC_NAME" -n "$ns" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)
        if [ -n "$ENDPOINTS" ]; then
          echo " ✅ Service $SVC_NAME in $ns has endpoints: $ENDPOINTS"
        else
          echo " ❌ Service $SVC_NAME in $ns has NO endpoints — label selector mismatch!"
          echo " Service selector:"
          kubectl get svc "$SVC_NAME" -n "$ns" -o jsonpath='{.spec.selector}' 2>/dev/null | jq . || true
        fi
      else
        echo " ⚠️ No vllm-service found in $ns"
      fi
    done
    # 1c. Check PodMonitors (llm-d guide deploys these for direct pod scraping)
    echo ""
    echo "--- Step 1c: PodMonitor configuration ---"
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
      PM_COUNT=$(kubectl get podmonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
      echo " PodMonitors in $ns: $PM_COUNT"
      kubectl get podmonitor -n "$ns" 2>/dev/null || true
    done
    # 2. Check WVA controller health
    echo ""
    echo "--- Step 2: WVA controller status ---"
    kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler
    WVA_POD=$(kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
    if [ -n "$WVA_POD" ]; then
      echo " Recent WVA controller logs:"
      kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=20 | grep -E "reconcil|metrics|error|saturation" || echo " (no matching log lines)"
    fi
    # 3. Check VariantAutoscaling status
    echo ""
    echo "--- Step 3: VariantAutoscaling status ---"
    kubectl get variantautoscaling -A -o wide 2>/dev/null || echo " No VariantAutoscalings found"
    # 4. Check ServiceMonitors exist
    echo ""
    echo "--- Step 4: ServiceMonitor configuration ---"
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do
      SM_COUNT=$(kubectl get servicemonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
      echo " ServiceMonitors in $ns: $SM_COUNT"
      kubectl get servicemonitor -n "$ns" 2>/dev/null || true
    done
    # 5. Wait for WVA to start processing metrics (up to 3 minutes)
    echo ""
    echo "--- Step 5: Waiting for WVA to detect metrics (up to 3 minutes) ---"
    METRICS_READY=false
    # 18 attempts x 10s sleep = 3 minutes.
    for i in $(seq 1 18); do
      VA_STATUS=$(kubectl get variantautoscaling -n "$LLMD_NAMESPACE" -o jsonpath='{.items[0].status.desiredOptimizedAlloc.accelerator}' 2>/dev/null || true)
      if [ -n "$VA_STATUS" ]; then
        echo " ✅ WVA optimization active — accelerator: $VA_STATUS"
        METRICS_READY=true
        break
      fi
      echo " Attempt $i/18: WVA not yet optimizing, waiting 10s..."
      sleep 10
    done
    if [ "$METRICS_READY" = "false" ]; then
      echo " ⚠️ WVA has not started optimizing after 3 minutes"
      echo " This may cause test timeouts — dumping diagnostics:"
      echo ""
      echo " === WVA controller logs (last 50 lines) ==="
      kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=50 2>/dev/null || true
      echo ""
      echo " === HPA status ==="
      kubectl get hpa -A 2>/dev/null || true
      echo ""
      echo " Continuing to tests anyway (they have their own timeouts)..."
    fi
    echo ""
    echo "=== Metrics pipeline verification complete ==="
# Downloads Go module dependencies before running the e2e suite.
# GOTOOLCHAIN=auto lets Go fetch the toolchain version go.mod requires;
# exporting it once applies to all three commands identically.
- name: Install Go dependencies
  run: |
    export GOTOOLCHAIN=auto
    go version
    go env GOTOOLCHAIN
    go mod download
# Runs the consolidated e2e suite; all configuration flows through env vars
# that the Go tests read. The echo block records the exact configuration in
# the job log for post-hoc debugging.
- name: Run OpenShift E2E tests
  env:
    # Consolidated e2e test environment variables
    ENVIRONMENT: openshift
    USE_SIMULATOR: "true"
    SCALE_TO_ZERO_ENABLED: "true"
    WVA_NAMESPACE: ${{ env.WVA_NAMESPACE }}
    MONITORING_NAMESPACE: openshift-user-workload-monitoring
    LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
    # Legacy variables for backward compatibility (if needed by tests)
    CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
    # Multi-model testing: secondary namespace for Model B
    LLMD_NAMESPACE_B: ${{ env.LLMD_NAMESPACE_B }}
    GATEWAY_NAME: infra-inference-scheduling-inference-gateway-istio
    DEPLOYMENT: ms-inference-scheduling-llm-d-modelservice-decode
    # Pass WVA_RELEASE_NAME so test can filter for current run's resources
    WVA_RELEASE_NAME: ${{ env.WVA_RELEASE_NAME }}
    # Controller instance label must match what the controller was deployed with
    CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
    MODEL_ID: ${{ env.MODEL_ID }}
    REQUEST_RATE: ${{ env.REQUEST_RATE }}
    NUM_PROMPTS: ${{ env.NUM_PROMPTS }}
  run: |
    echo "Running consolidated E2E tests on OpenShift with configuration:"
    echo " ENVIRONMENT: $ENVIRONMENT"
    echo " USE_SIMULATOR: $USE_SIMULATOR"
    echo " SCALE_TO_ZERO_ENABLED: $SCALE_TO_ZERO_ENABLED"
    echo " WVA_NAMESPACE: $WVA_NAMESPACE"
    echo " MONITORING_NAMESPACE: $MONITORING_NAMESPACE"
    echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
    echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
    echo " DEPLOYMENT: $DEPLOYMENT"
    echo " GATEWAY_NAME: $GATEWAY_NAME"
    echo " MODEL_ID: $MODEL_ID"
    echo " REQUEST_RATE: $REQUEST_RATE"
    echo " NUM_PROMPTS: $NUM_PROMPTS"
    echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
    # The make target is the single entry point; test failures fail this step.
    make test-e2e-full
- name: Cleanup infrastructure
  # Cleanup on success or cancellation, but NOT on failure (preserve for debugging)
  # Use SKIP_CLEANUP=true to keep resources after successful runs
  if: (success() || cancelled()) && env.SKIP_CLEANUP != 'true'
  run: |
    echo "Cleaning up ALL test infrastructure..."
    echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
    echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
    echo " WVA_NAMESPACE: $WVA_NAMESPACE"
    echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
    echo " MODEL_B_RELEASE: $MODEL_B_RELEASE"
    # Uninstall all WVA helm releases before deleting namespaces
    # This ensures proper cleanup of resources and removes helm tracking
    # NOTE(review): `helm uninstall --ignore-not-found` requires Helm >= 3.13;
    # the trailing `|| true` keeps older clients from failing the step — confirm
    # the runner's Helm version.
    echo "Uninstalling WVA helm releases..."
    helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    helm uninstall "$MODEL_B_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    echo "Uninstalling llm-d helm releases in primary namespace..."
    for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
      echo " Uninstalling release: $release"
      helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    done
    echo "Uninstalling llm-d helm releases in secondary namespace..."
    for release in $(helm list -n "$LLMD_NAMESPACE_B" -q 2>/dev/null); do
      echo " Uninstalling release: $release"
      helm uninstall "$release" -n "$LLMD_NAMESPACE_B" --ignore-not-found --wait --timeout 60s || true
    done
    # Delete all PR-specific namespaces
    echo "Deleting llm-d namespace $LLMD_NAMESPACE..."
    kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
    echo "Deleting llm-d namespace $LLMD_NAMESPACE_B..."
    kubectl delete namespace "$LLMD_NAMESPACE_B" --ignore-not-found --timeout=120s || true
    echo "Deleting WVA namespace $WVA_NAMESPACE..."
    kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
    # Clean up cluster-scoped WVA resources for THIS release only
    # Use both name and instance labels to avoid deleting resources from other PRs
    echo "Removing cluster-scoped WVA resources for release $WVA_RELEASE_NAME..."
    kubectl delete clusterrole,clusterrolebinding -l app.kubernetes.io/name=workload-variant-autoscaler,app.kubernetes.io/instance="$WVA_RELEASE_NAME" --ignore-not-found || true
    # Also clean up cluster-scoped resources owned by this PR's namespaces
    # (covers helmfile-created resources whose instance label differs from WVA_RELEASE_NAME)
    # Ownership is determined by the meta.helm.sh/release-namespace annotation:
    # only resources whose owning release lives in one of this run's namespaces
    # are deleted, so other PRs' resources are untouched.
    for kind in clusterrole clusterrolebinding; do
      kubectl get "$kind" -o json 2>/dev/null | \
        jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | "\(.metadata.name)\t\(.metadata.annotations["meta.helm.sh/release-namespace"] // "")"' 2>/dev/null | \
        while IFS=$'\t' read -r name ns; do
          if [ "$ns" = "$LLMD_NAMESPACE" ] || [ "$ns" = "$LLMD_NAMESPACE_B" ] || [ "$ns" = "$WVA_NAMESPACE" ]; then
            echo " Deleting $kind/$name (owned by PR namespace '$ns')"
            kubectl delete "$kind" "$name" --ignore-not-found || true
          fi
        done
    done
    echo "Cleanup complete"
# Diagnostic dump that runs regardless of outcome (always()); every command
# is best-effort so a partially-deleted cluster never fails this step.
- name: Dump cluster state
  if: always()
  run: |
    echo "=== Dumping cluster state for diagnostics ==="
    echo ""
    echo "=== VAs ==="
    kubectl get va -n "$LLMD_NAMESPACE" 2>/dev/null || true
    kubectl get va -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
    echo ""
    echo "=== HPAs ==="
    kubectl get hpa -n "$LLMD_NAMESPACE" 2>/dev/null || true
    kubectl get hpa -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
    echo ""
    echo "=== Controller pods ==="
    kubectl get pods -n "$WVA_NAMESPACE" 2>/dev/null || true
    echo ""
    echo "=== All resources ==="
    for ns in "$WVA_NAMESPACE" "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
      # Skip namespaces that were already cleaned up or never created.
      if kubectl get namespace "$ns" &>/dev/null; then
        echo "--- Namespace: $ns ---"
        kubectl get all -n "$ns" 2>/dev/null || true
        echo ""
        echo "--- Events in $ns ---"
        kubectl get events -n "$ns" --sort-by='.lastTimestamp' 2>/dev/null | tail -20 || true
        echo ""
      fi
    done
- name: Scale down GPU workloads on failure
  # On failure, scale down decode deployments to free GPUs while preserving
  # other resources (VA, HPA, controller, gateway) for debugging
  if: failure()
  run: |
    echo "Test failed - scaling down decode deployments to free GPUs..."
    echo "Other resources (VA, HPA, controller logs) are preserved for debugging"
    echo ""
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
      if kubectl get namespace "$ns" &>/dev/null; then
        echo "=== Scaling down decode deployments in $ns ==="
        # NOTE(review): this label uses camelCase (llm-d.ai/inferenceServing)
        # while the metrics-verification step selects llm-d.ai/inference-serving;
        # confirm which casing the deployed pods actually carry. The name-pattern
        # fallback below covers a mismatch either way.
        kubectl scale deployment -n "$ns" -l llm-d.ai/inferenceServing=true --replicas=0 || true
        # Also try by name pattern in case labels are missing
        kubectl get deployment -n "$ns" -o name 2>/dev/null | grep decode | while read -r deploy; do
          echo " Scaling down: $deploy"
          kubectl scale "$deploy" -n "$ns" --replicas=0 || true
        done
      fi
    done
# Report status back to PR for issue_comment triggered runs
# This ensures fork PRs show the correct status after /ok-to-test runs complete
report-status:
  runs-on: ubuntu-latest
  needs: [gate, e2e-openshift]
  # Run always (even on failure) but only for issue_comment events
  if: always() && github.event_name == 'issue_comment' && needs.gate.outputs.should_run == 'true'
  steps:
    - name: Report status to PR
      uses: actions/github-script@v7
      env:
        # Pass workflow values through env vars instead of interpolating
        # ${{ }} directly into the script body. Direct interpolation is a
        # script-injection vector (and breaks on values containing quotes);
        # process.env is always treated as data.
        PR_HEAD_SHA: ${{ needs.gate.outputs.pr_head_sha }}
        E2E_RESULT: ${{ needs.e2e-openshift.result }}
      with:
        script: |
          const prHeadSha = process.env.PR_HEAD_SHA;
          const e2eResult = process.env.E2E_RESULT;
          // Map job result to commit status.
          // Valid commit-status states are: success, pending, failure, error.
          let state, description;
          if (e2eResult === 'success') {
            state = 'success';
            description = 'E2E tests passed';
          } else if (e2eResult === 'skipped') {
            state = 'pending';
            description = 'E2E tests skipped';
          } else if (e2eResult === 'cancelled') {
            // 'error' is the API's state for interrupted runs; previously this
            // was reported as 'failure', which misrepresents a cancellation.
            state = 'error';
            description = 'E2E tests cancelled';
          } else {
            state = 'failure';
            description = 'E2E tests failed';
          }
          console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: prHeadSha,
            state: state,
            target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
            description: description,
            // context.workflow is the workflow name — avoids ${{ }} interpolation.
            context: `${context.workflow} / e2e (comment trigger)`
          });
          console.log('Status reported successfully');