Add optimization loop performance metrics #2954
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: CI - OpenShift E2E Tests | |
| # Permissions needed for various jobs | |
| permissions: | |
| contents: read | |
| packages: write | |
| pull-requests: write # For posting comments on PRs | |
| statuses: write # For reporting status on fork PR commits | |
| # Cancel previous runs on the same PR to avoid resource conflicts | |
| # Only group by PR number for legitimate triggers (pull_request, workflow_dispatch, /ok-to-test, or /retest comments) | |
| # Regular comments get a unique group (run_id) so they don't cancel in-progress test runs | |
| # | |
| # Logic: | |
| # - Regular comments (not /ok-to-test or /retest): unique group prevents cancellation of real tests | |
| # - Valid triggers: group 'e2e-openshift-{pr_number}' (can cancel previous runs for same PR) | |
| # - Fallback chain for ID: pull_request.number -> issue.number -> run_id | |
| # | |
| # NOTE: Valid command list (/ok-to-test, /retest) must stay in sync with gate job validation (line ~125) | |
| # NOTE(review): contains() below is a substring match, while the gate job requires an exact-match comment. | |
| # A comment that merely mentions "/retest" therefore joins the PR group (cancelling an in-progress run) | |
| # and is then skipped by the gate — confirm this trade-off is intended. | |
| concurrency: | |
| group: >- | |
| ${{ | |
| github.event_name == 'issue_comment' && | |
| !contains(github.event.comment.body, '/ok-to-test') && | |
| !contains(github.event.comment.body, '/retest') | |
| && format('comment-isolated-{0}', github.run_id) | |
| || format('e2e-openshift-{0}', | |
| github.event.pull_request.number | |
| || github.event.issue.number | |
| || github.run_id) | |
| }} | |
| cancel-in-progress: true | |
| on: | |
| pull_request: | |
| branches: | |
| - main | |
| - dev | |
| # Allow maintainers to trigger tests on fork PRs via /ok-to-test comment | |
| issue_comment: | |
| types: [created] | |
| workflow_dispatch: | |
| inputs: | |
| model_id: | |
| description: 'Model ID' | |
| required: false | |
| default: 'unsloth/Meta-Llama-3.1-8B' | |
| accelerator_type: | |
| description: 'Accelerator type (H100, A100, L40S)' | |
| required: false | |
| default: 'H100' | |
| # NOTE(review): the e2e job's env falls back to 'A100' when this input is absent (non-dispatch events), which differs from this dispatch default — confirm intended. | |
| request_rate: | |
| description: 'Request rate (req/s)' | |
| required: false | |
| default: '20' | |
| num_prompts: | |
| description: 'Number of prompts' | |
| required: false | |
| default: '3000' | |
| skip_cleanup: | |
| description: 'Skip cleanup after tests' | |
| required: false | |
| default: 'false' | |
| max_num_seqs: | |
| description: 'vLLM max batch size (lower = easier to saturate)' | |
| required: false | |
| default: '1' | |
| # NOTE(review): the e2e job's env falls back to '5' when this input is absent (non-dispatch events), which differs from this dispatch default — confirm intended. | |
| hpa_stabilization_seconds: | |
| description: 'HPA stabilization window in seconds' | |
| required: false | |
| default: '240' | |
| jobs: | |
| # Check if PR contains code changes (not just docs/metadata) | |
| check-code-changes: | |
| # Emits has_code_changes='true' for any non-PR trigger; for PRs, 'true' only when files outside docs/metadata changed. | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| pull-requests: read | |
| outputs: | |
| has_code_changes: ${{ steps.set-output.outputs.has_code_changes }} | |
| steps: | |
| - name: Checkout source | |
| if: github.event_name == 'pull_request' | |
| uses: actions/checkout@v4 | |
| - name: Check for code changes | |
| if: github.event_name == 'pull_request' | |
| uses: dorny/paths-filter@v3 | |
| id: filter | |
| with: | |
| # Negation-only filter: any changed file NOT matching one of these paths counts as a code change. | |
| filters: | | |
| code: | |
| - '!docs/**' | |
| - '!README.md' | |
| - '!CONTRIBUTING.md' | |
| - '!LICENSE' | |
| - '!OWNERS' | |
| - '!PROJECT' | |
| - name: Set output | |
| id: set-output | |
| run: | | |
| if [ "${{ github.event_name }}" != "pull_request" ]; then | |
| # Always run for issue_comment (/ok-to-test, /retest) and workflow_dispatch | |
| echo "has_code_changes=true" >> $GITHUB_OUTPUT | |
| elif [ -n "${{ steps.filter.outputs.code }}" ]; then | |
| echo "has_code_changes=${{ steps.filter.outputs.code }}" >> $GITHUB_OUTPUT | |
| else | |
| # Fail open: if the filter step produced no output, assume code changed. | |
| echo "has_code_changes=true" >> $GITHUB_OUTPUT | |
| fi | |
| # Gate: Check permissions and handle /ok-to-test for fork PRs. | |
| # - Maintainers (write access): Tests run automatically on pull_request. | |
| # - Fork PRs: Gate succeeds (no failure) so the PR does not show a false red check; E2E runs | |
| # only after a maintainer comments /ok-to-test. Branch protection should require the | |
| # "e2e-openshift" job so merge stays blocked until that run passes. | |
| # Outputs: should_run ('true'/'false'), pr_number, pr_head_sha, is_fork_pr ('true'/'false'). | |
| gate: | |
| needs: check-code-changes | |
| if: needs.check-code-changes.outputs.has_code_changes == 'true' | |
| runs-on: ubuntu-latest | |
| outputs: | |
| should_run: ${{ steps.check.outputs.should_run }} | |
| pr_number: ${{ steps.check.outputs.pr_number }} | |
| pr_head_sha: ${{ steps.check.outputs.pr_head_sha }} | |
| is_fork_pr: ${{ steps.check.outputs.is_fork_pr }} | |
| steps: | |
| - name: Check permissions and OpenShift E2E triggers (/ok-to-test, /retest) | |
| id: check | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| // True when `username` holds admin/maintain/write on this repo; false if the lookup fails. | |
| async function hasWriteAccess(username) { | |
| try { | |
| const response = await github.rest.repos.getCollaboratorPermissionLevel({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| username, | |
| }); | |
| // Treat any of these collaborator roles as privileged. | |
| return ['admin', 'maintain', 'write'].includes(response.data.permission); | |
| } catch (e) { | |
| console.log(`Could not get permissions for ${username}: ${e.message}`); | |
| return false; | |
| } | |
| } | |
| // Always run for workflow_dispatch | |
| if (context.eventName === 'workflow_dispatch') { | |
| core.setOutput('should_run', 'true'); | |
| core.setOutput('pr_number', ''); | |
| core.setOutput('pr_head_sha', context.sha); | |
| core.setOutput('is_fork_pr', 'false'); | |
| return; | |
| } | |
| // Handle issue_comment event (/ok-to-test or /retest) | |
| if (context.eventName === 'issue_comment') { | |
| const comment = context.payload.comment.body.trim(); | |
| const issue = context.payload.issue; | |
| // Only process /ok-to-test or /retest comments on PRs | |
| if (!issue.pull_request) { | |
| console.log('Comment is not on a PR, skipping'); | |
| core.setOutput('should_run', 'false'); | |
| return; | |
| } | |
| // NOTE: This list must stay in sync with the concurrency group logic at the top of the workflow | |
| const validCommands = ['/ok-to-test', '/retest']; | |
| // Exact match on the trimmed body — comments that merely contain a command are ignored here. | |
| if (!validCommands.includes(comment)) { | |
| console.log(`Comment "${comment}" is not a valid trigger command, skipping`); | |
| core.setOutput('should_run', 'false'); | |
| return; | |
| } | |
| // Check if commenter has write access | |
| const commenter = context.payload.comment.user.login; | |
| const hasAccess = await hasWriteAccess(commenter); | |
| if (!hasAccess) { | |
| console.log(`User ${commenter} does not have write access, ignoring ${comment}`); | |
| core.setOutput('should_run', 'false'); | |
| return; | |
| } | |
| // Get PR details to get head SHA | |
| const { data: pr } = await github.rest.pulls.get({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| pull_number: issue.number | |
| }); | |
| // Check if PR is from a fork | |
| const baseRepo = `${context.repo.owner}/${context.repo.repo}`; | |
| const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo; | |
| const isFork = headRepo !== baseRepo; | |
| console.log(`${comment} approved by ${commenter} for PR #${issue.number}`); | |
| console.log(`PR head SHA: ${pr.head.sha}`); | |
| console.log(`Is fork PR: ${isFork} (head: ${headRepo}, base: ${baseRepo})`); | |
| core.setOutput('should_run', 'true'); | |
| core.setOutput('pr_number', issue.number.toString()); | |
| core.setOutput('pr_head_sha', pr.head.sha); | |
| core.setOutput('is_fork_pr', isFork ? 'true' : 'false'); | |
| // Add reaction to acknowledge | |
| // NOTE(review): the reaction/comment calls below are not wrapped in try/catch — an API failure here | |
| // fails the gate step even though should_run was already set; confirm that is acceptable. | |
| await github.rest.reactions.createForIssueComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| comment_id: context.payload.comment.id, | |
| content: 'rocket' | |
| }); | |
| // Post comment with link to the e2e workflow run | |
| const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; | |
| const cmdDesc = comment === '/ok-to-test' ? 'approve and run' : 're-run'; | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: issue.number, | |
| body: `🚀 **OpenShift E2E** — ${cmdDesc} (\`${comment}\`)\n\n[View the OpenShift E2E workflow run](${runUrl})` | |
| }); | |
| return; | |
| } | |
| // Handle pull_request event | |
| const pr = context.payload.pull_request; | |
| const prAuthor = pr.user.login; | |
| const prNumber = pr.number; | |
| const prHeadSha = pr.head.sha; | |
| // Check if PR is from a fork | |
| const baseRepo = `${context.repo.owner}/${context.repo.repo}`; | |
| const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo; | |
| const isFork = headRepo !== baseRepo; | |
| console.log(`PR #${prNumber} is from fork: ${isFork} (head: ${headRepo}, base: ${baseRepo})`); | |
| core.setOutput('pr_number', prNumber.toString()); | |
| core.setOutput('pr_head_sha', prHeadSha); | |
| core.setOutput('is_fork_pr', isFork ? 'true' : 'false'); | |
| // Check if PR author has write access | |
| const isPrivileged = await hasWriteAccess(prAuthor); | |
| console.log(`PR #${prNumber} author ${prAuthor}: privileged=${isPrivileged}`); | |
| // Check if we already posted a bot comment | |
| // NOTE(review): listComments returns only the first page (30 by default); the dedupe check may | |
| // miss the marker comment on long threads — consider github.paginate. | |
| const comments = await github.rest.issues.listComments({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber | |
| }); | |
| const botComment = comments.data.find(c => | |
| c.user.type === 'Bot' && | |
| c.body.includes('ok-to-test') | |
| ); | |
| // Helper to safely post a comment (may fail on fork PRs due to permissions) | |
| async function tryPostComment(body) { | |
| try { | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber, | |
| body: body | |
| }); | |
| return true; | |
| } catch (e) { | |
| // Fork PRs can't post comments on pull_request event (GitHub security restriction) | |
| console.log(`Could not post comment (expected for fork PRs): ${e.message}`); | |
| return false; | |
| } | |
| } | |
| if (isPrivileged) { | |
| // For maintainer/admin fork PRs, we need to trigger via /ok-to-test | |
| // because fork PRs don't have access to secrets on pull_request event | |
| if (isFork) { | |
| console.log(`Maintainer fork PR detected - auto-triggering /ok-to-test for ${prAuthor}`); | |
| core.setOutput('should_run', 'false'); // Don't run on pull_request event | |
| // Auto-post /ok-to-test to trigger issue_comment workflow | |
| if (!botComment) { | |
| const posted = await tryPostComment(`/ok-to-test`); | |
| if (!posted) { | |
| console.log('Note: Maintainer will need to manually comment /ok-to-test'); | |
| } | |
| } | |
| // Do not fail the gate: fork PRs cannot run E2E on pull_request (no secrets). | |
| // Gate succeeds so the PR does not show a false failure; branch protection | |
| // should require "e2e-openshift" so merge stays blocked until /ok-to-test run passes. | |
| return; | |
| } | |
| // Non-fork PR from maintainer - run directly | |
| core.setOutput('should_run', 'true'); | |
| return; | |
| } | |
| // External contributor - post instructions and skip | |
| console.log('External contributor PR - posting instructions'); | |
| core.setOutput('should_run', 'false'); | |
| if (!botComment) { | |
| const posted = await tryPostComment(`👋 Thanks for your contribution!\n\nThis PR is from a fork, so **OpenShift E2E** (GPU) tests require approval to run.\n\n**For maintainers/admins:** Comment \`/ok-to-test\` to approve and trigger **OpenShift E2E** on this PR, or \`/retest\` to re-run OpenShift E2E (e.g. after a failure or new commits).\n\n**For contributors:** Please wait for a maintainer or admin to approve running the tests.`); | |
| if (!posted) { | |
| console.log('Note: Could not post instructions comment on fork PR'); | |
| } | |
| } | |
| // Do not fail the gate: GitHub does not allow updating status from upstream on fork | |
| // PRs, so a failed gate would stay red even after /ok-to-test run passes. Let the gate | |
| // succeed; branch protection should require "e2e-openshift" so merge stays blocked | |
| // until a maintainer comments /ok-to-test and E2E passes. | |
| - name: Write workflow summary | |
| if: always() | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| // These literals are expanded by Actions before the script runs (trusted step outputs / event name). | |
| const shouldRun = '${{ steps.check.outputs.should_run }}'; | |
| const isFork = '${{ steps.check.outputs.is_fork_pr }}'; | |
| const eventName = '${{ github.event_name }}'; | |
| if (shouldRun === 'true') { | |
| core.summary.addRaw('✅ **E2E tests will run** for this trigger.\n').write(); | |
| } else if (isFork === 'true' && eventName === 'pull_request') { | |
| core.summary.addRaw([ | |
| '⏸️ **E2E tests skipped — fork PR**\n\n', | |
| 'Fork PRs cannot run E2E on `pull_request` events (no access to secrets/GPU runners).\n\n', | |
| 'A maintainer must comment \`/ok-to-test\` to trigger the **OpenShift E2E** suite. ', | |
| 'Branch protection should require **e2e-openshift** so merge stays blocked until E2E passes.\n', | |
| ].join('')).write(); | |
| } else { | |
| core.summary.addRaw('⏸️ **E2E tests were skipped** (gate check did not pass for this trigger).\n').write(); | |
| } | |
| # Build the WVA controller image on GitHub-hosted runner (has proper Docker setup) | |
| # Note: Skip for fork PRs on pull_request event (no secrets access). | |
| # For fork PRs, build-image runs via issue_comment trigger (/ok-to-test). | |
| build-image: | |
| needs: gate | |
| if: | | |
| needs.gate.outputs.should_run == 'true' && | |
| (needs.gate.outputs.is_fork_pr != 'true' || github.event_name != 'pull_request') | |
| runs-on: ubuntu-latest | |
| outputs: | |
| image_tag: ${{ steps.build.outputs.image_tag }} | |
| steps: | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| # Use PR head SHA from gate (works for both pull_request and issue_comment) | |
| ref: ${{ needs.gate.outputs.pr_head_sha }} | |
| - name: Log in to GHCR | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| # NOTE(review): PAT-style secrets are used instead of github.token despite packages:write — confirm needed (e.g. cross-repo push). | |
| username: ${{ secrets.CR_USER }} | |
| password: ${{ secrets.CR_TOKEN }} | |
| - name: Build and push image | |
| id: build | |
| env: | |
| REGISTRY: ghcr.io | |
| IMAGE_NAME: ${{ github.repository }} | |
| # Use PR head SHA from gate | |
| GIT_REF: ${{ needs.gate.outputs.pr_head_sha }} | |
| run: | | |
| # Build image with git ref tag for this PR | |
| # Use first 8 chars of the git ref (POSIX-compliant) | |
| IMAGE_TAG="ref-$(printf '%s' "$GIT_REF" | cut -c1-8)" | |
| FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" | |
| echo "Building image: $FULL_IMAGE" | |
| echo "Git ref: $GIT_REF" | |
| # Build and push using make targets | |
| make docker-build IMG="$FULL_IMAGE" | |
| make docker-push IMG="$FULL_IMAGE" | |
| echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT | |
| echo "Image built and pushed: $FULL_IMAGE" | |
| # Run e2e tests on OpenShift self-hosted runner (vllm-d cluster). | |
| # pok-prod runners are reserved for nightly E2E only. | |
| e2e-openshift: | |
| runs-on: [self-hosted, openshift, vllm-d] | |
| needs: [gate, build-image] | |
| if: needs.gate.outputs.should_run == 'true' | |
| env: | |
| MODEL_ID: ${{ github.event.inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }} | |
| GOTOOLCHAIN: auto | |
| ACCELERATOR_TYPE: ${{ github.event.inputs.accelerator_type || 'A100' }} | |
| REQUEST_RATE: ${{ github.event.inputs.request_rate || '20' }} | |
| NUM_PROMPTS: ${{ github.event.inputs.num_prompts || '3000' }} | |
| MAX_NUM_SEQS: ${{ github.event.inputs.max_num_seqs || '5' }} | |
| HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '240' }} | |
| SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }} | |
| # Use main branch of llm-d/llm-d for inferencepool chart v1.2.1 (GA API support) | |
| LLM_D_RELEASE: main | |
| # PR-specific namespaces for isolation between concurrent PR tests | |
| # Primary llm-d namespace (Model A1 + A2) | |
| LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }} | |
| # Secondary llm-d namespace (Model B) | |
| LLMD_NAMESPACE_B: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}-b | |
| # WVA controller namespace (monitors all models) | |
| WVA_NAMESPACE: llm-d-autoscaler-pr-${{ needs.gate.outputs.pr_number || github.run_id }} | |
| # Unique release names per run to avoid conflicts | |
| WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }} | |
| # Model A1: Primary deployment in LLMD_NAMESPACE | |
| MODEL_A1_RELEASE: model-a1-${{ github.run_id }} | |
| # Model B: Deployment in LLMD_NAMESPACE_B | |
| MODEL_B_RELEASE: model-b-${{ github.run_id }} | |
| # Use the image built in the previous job | |
| WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }} | |
| steps: | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| # Use PR head SHA from gate (works for both pull_request and issue_comment) | |
| ref: ${{ needs.gate.outputs.pr_head_sha }} | |
| - name: Set up Go | |
| uses: actions/setup-go@v6 | |
| with: | |
| go-version: "1.25.x" | |
| cache-dependency-path: ./go.sum | |
| - name: Verify Go toolchain | |
| run: | | |
| which go | |
| go version | |
| go env GOTOOLCHAIN | |
| - name: Install tools (kubectl, oc, helm, make) | |
| run: | | |
| sudo apt-get update && sudo apt-get install -y make | |
| # Install kubectl - use pinned version for reproducible CI builds | |
| # Pinned 2025-12: v1.31.0 tested compatible with OpenShift 4.16+ | |
| # Update this version when upgrading target cluster or during regular dependency reviews | |
| KUBECTL_VERSION="v1.31.0" | |
| echo "Installing kubectl version: $KUBECTL_VERSION" | |
| curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" | |
| curl -fsSL --retry 3 --retry-delay 5 -o kubectl.sha256 "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256" | |
| echo "$(cat kubectl.sha256) kubectl" | sha256sum --check | |
| chmod +x kubectl | |
| sudo mv kubectl /usr/local/bin/ | |
| rm -f kubectl.sha256 | |
| # Install oc (OpenShift CLI) | |
| curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz" | |
| tar -xzf openshift-client-linux.tar.gz | |
| sudo mv oc /usr/local/bin/ | |
| rm -f openshift-client-linux.tar.gz kubectl README.md | |
| # Install helm | |
| curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash | |
| - name: Verify cluster access | |
| run: | | |
| echo "Verifying cluster access..." | |
| kubectl cluster-info | |
| kubectl get nodes | |
| - name: Verify correct cluster (vllm-d, not pok-prod) | |
| run: | | |
| # PR E2E tests must run on the vllm-d cluster, not pok-prod-sa. | |
| # pok-prod-sa is reserved for nightly E2E runs only. | |
| # Runners with the 'pok-prod' label connect to pok-prod-sa; | |
| # runners without it connect to vllm-d. | |
| CLUSTER_API=$(kubectl cluster-info 2>/dev/null | head -1 | grep -oE 'https://[^ ]+') | |
| echo "Cluster API: $CLUSTER_API" | |
| if echo "$CLUSTER_API" | grep -q "pokprod"; then | |
| echo "::error::This runner is connected to pok-prod-sa, but PR E2E tests must run on vllm-d." | |
| echo "::error::The runner likely has the 'pok-prod' label. PR CI should only use vllm-d runners." | |
| exit 1 | |
| fi | |
| echo "Cluster verified: running on vllm-d" | |
| - name: Check GPU availability | |
| id: gpu-check | |
| run: | | |
| echo "Checking GPU availability for e2e test..." | |
| # Minimum GPUs needed: 2 models × 2 GPUs each = 4 | |
| # Recommended with scale-up headroom: 6 | |
| REQUIRED_GPUS=4 | |
| RECOMMENDED_GPUS=6 | |
| # Total allocatable GPUs across all nodes | |
| TOTAL_GPUS=$(kubectl get nodes -o json | \ | |
| jq '[.items[].status.allocatable["nvidia.com/gpu"] // "0" | tonumber] | add // 0') | |
| # Currently requested GPUs by all pods | |
| ALLOCATED_GPUS=$(kubectl get pods --all-namespaces -o json | \ | |
| jq '[.items[] | select(.status.phase == "Running" or .status.phase == "Pending") | .spec.containers[]?.resources.requests["nvidia.com/gpu"] // "0" | tonumber] | add // 0') | |
| AVAILABLE_GPUS=$((TOTAL_GPUS - ALLOCATED_GPUS)) | |
| # Total allocatable CPU (cores) and memory (Gi) across all nodes | |
| # CPU may be in millicores (e.g. "8000m") or cores (e.g. "8") | |
| TOTAL_CPU=$(kubectl get nodes -o json | \ | |
| jq '[.items[].status.allocatable.cpu // "0" | if endswith("m") then (gsub("m$";"") | tonumber / 1000) else tonumber end] | add | floor') | |
| TOTAL_MEM_KI=$(kubectl get nodes -o json | \ | |
| jq '[.items[].status.allocatable.memory // "0" | gsub("[^0-9]";"") | tonumber] | add') | |
| TOTAL_MEM_GI=$((TOTAL_MEM_KI / 1048576)) | |
| NODE_COUNT=$(kubectl get nodes --no-headers | wc -l | tr -d ' ') | |
| GPU_NODE_COUNT=$(kubectl get nodes -o json | \ | |
| jq '[.items[] | select((.status.allocatable["nvidia.com/gpu"] // "0" | tonumber) > 0)] | length') | |
| # Export all values for the PR comment step | |
| echo "total_gpus=$TOTAL_GPUS" >> $GITHUB_OUTPUT | |
| echo "allocated_gpus=$ALLOCATED_GPUS" >> $GITHUB_OUTPUT | |
| echo "available_gpus=$AVAILABLE_GPUS" >> $GITHUB_OUTPUT | |
| echo "total_cpu=$TOTAL_CPU" >> $GITHUB_OUTPUT | |
| echo "total_mem_gi=$TOTAL_MEM_GI" >> $GITHUB_OUTPUT | |
| echo "node_count=$NODE_COUNT" >> $GITHUB_OUTPUT | |
| echo "gpu_node_count=$GPU_NODE_COUNT" >> $GITHUB_OUTPUT | |
| echo "required_gpus=$REQUIRED_GPUS" >> $GITHUB_OUTPUT | |
| echo "recommended_gpus=$RECOMMENDED_GPUS" >> $GITHUB_OUTPUT | |
| echo "## GPU Status" >> $GITHUB_STEP_SUMMARY | |
| echo "| Metric | Count |" >> $GITHUB_STEP_SUMMARY | |
| echo "|--------|-------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| Total cluster GPUs | $TOTAL_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Currently allocated | $ALLOCATED_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Available | $AVAILABLE_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Required (minimum) | $REQUIRED_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Recommended (with scale-up) | $RECOMMENDED_GPUS |" >> $GITHUB_STEP_SUMMARY | |
| if [ "$AVAILABLE_GPUS" -lt "$REQUIRED_GPUS" ]; then | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "❌ **Insufficient GPUs** — need $REQUIRED_GPUS but only $AVAILABLE_GPUS available. Re-run when GPUs free up." >> $GITHUB_STEP_SUMMARY | |
| echo "::error::Insufficient GPUs: need $REQUIRED_GPUS, have $AVAILABLE_GPUS available. Try again later." | |
| echo "gpu_available=false" >> $GITHUB_OUTPUT | |
| exit 1 | |
| elif [ "$AVAILABLE_GPUS" -lt "$RECOMMENDED_GPUS" ]; then | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "⚠️ **Low GPU headroom** — $AVAILABLE_GPUS available (need $RECOMMENDED_GPUS for scale-up tests). Tests may fail during scale-up." >> $GITHUB_STEP_SUMMARY | |
| echo "::warning::Low GPU headroom: $AVAILABLE_GPUS available, $RECOMMENDED_GPUS recommended for scale-up tests" | |
| echo "gpu_available=true" >> $GITHUB_OUTPUT | |
| else | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "✅ **GPUs available** — $AVAILABLE_GPUS GPUs free ($REQUIRED_GPUS required, $RECOMMENDED_GPUS recommended)" >> $GITHUB_STEP_SUMMARY | |
| echo "gpu_available=true" >> $GITHUB_OUTPUT | |
| fi | |
| - name: Post GPU status to PR | |
| if: always() && needs.gate.outputs.pr_number != '' | |
| continue-on-error: true | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| PR_NUMBER: ${{ needs.gate.outputs.pr_number }} | |
| run: | | |
| GPU_STATUS="${{ steps.gpu-check.outcome }}" | |
| GPU_AVAIL="${{ steps.gpu-check.outputs.gpu_available }}" | |
| TOTAL_GPUS="${{ steps.gpu-check.outputs.total_gpus }}" | |
| ALLOCATED_GPUS="${{ steps.gpu-check.outputs.allocated_gpus }}" | |
| AVAILABLE_GPUS="${{ steps.gpu-check.outputs.available_gpus }}" | |
| TOTAL_CPU="${{ steps.gpu-check.outputs.total_cpu }}" | |
| TOTAL_MEM_GI="${{ steps.gpu-check.outputs.total_mem_gi }}" | |
| NODE_COUNT="${{ steps.gpu-check.outputs.node_count }}" | |
| GPU_NODE_COUNT="${{ steps.gpu-check.outputs.gpu_node_count }}" | |
| REQUIRED_GPUS="${{ steps.gpu-check.outputs.required_gpus }}" | |
| RECOMMENDED_GPUS="${{ steps.gpu-check.outputs.recommended_gpus }}" | |
| NL=$'\n' | |
| TABLE="| Resource | Total | Allocated | Available |${NL}|----------|-------|-----------|----------|${NL}| GPUs | $TOTAL_GPUS | $ALLOCATED_GPUS | **$AVAILABLE_GPUS** |${NL}${NL}| Cluster | Value |${NL}|---------|-------|${NL}| Nodes | $NODE_COUNT ($GPU_NODE_COUNT with GPUs) |${NL}| Total CPU | ${TOTAL_CPU} cores |${NL}| Total Memory | ${TOTAL_MEM_GI} Gi |${NL}| GPUs required | $REQUIRED_GPUS (min) / $RECOMMENDED_GPUS (recommended) |" | |
| if [ "$GPU_STATUS" = "failure" ]; then | |
| HEADER="### GPU Pre-flight Check ❌" | |
| MSG="**Insufficient GPUs** to run OpenShift E2E. Re-run with \`/retest\` (OpenShift E2E) when GPUs free up." | |
| elif [ "$GPU_AVAIL" = "true" ]; then | |
| HEADER="### GPU Pre-flight Check ✅" | |
| MSG="GPUs are available for e2e-openshift tests. Proceeding with deployment." | |
| else | |
| HEADER="### GPU Pre-flight Check ⚠️" | |
| MSG="Low GPU headroom — tests may fail during scale-up phases." | |
| fi | |
| BODY="${HEADER}${NL}${MSG}${NL}${NL}${TABLE}" | |
| PAYLOAD=$(jq -n --arg body "$BODY" '{"body": $body}') | |
| curl -s -X POST \ | |
| -H "Authorization: token $GH_TOKEN" \ | |
| -H "Accept: application/vnd.github.v3+json" \ | |
| "https://api.github.com/repos/${{ github.repository }}/issues/$PR_NUMBER/comments" \ | |
| -d "$PAYLOAD" | |
| - name: Get HF token from cluster secret | |
| id: hf-token | |
| run: | | |
| echo "Reading HF token from cluster secret llm-d-hf-token in default namespace..." | |
| # The llm-d-hf-token secret exists in the default namespace on the cluster | |
| # Check secret existence separately from key retrieval for better error messages | |
| if ! kubectl get secret llm-d-hf-token -n default &>/dev/null; then | |
| echo "::error::Secret 'llm-d-hf-token' not found in default namespace" | |
| echo "::error::Please ensure the HF token secret exists on the cluster" | |
| exit 1 | |
| fi | |
| # Read the token and mask it in logs | |
| HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d) | |
| if [ -z "$HF_TOKEN" ]; then | |
| echo "::error::Secret 'llm-d-hf-token' exists but 'HF_TOKEN' key is empty or missing" | |
| exit 1 | |
| fi | |
| # Mask the token in workflow logs | |
| echo "::add-mask::$HF_TOKEN" | |
| # Export for subsequent steps | |
| echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV | |
| echo "HF token retrieved successfully from cluster secret" | |
| - name: Clean up resources for this PR | |
| run: | | |
| echo "Cleaning up WVA resources for this PR's namespaces only..." | |
| echo " LLMD_NAMESPACE: $LLMD_NAMESPACE" | |
| echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B" | |
| echo " WVA_NAMESPACE: $WVA_NAMESPACE" | |
| # Only clean up the 3 namespaces associated with THIS PR | |
| # Do NOT touch namespaces from other PRs to avoid race conditions | |
| for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do | |
| if kubectl get namespace "$ns" &>/dev/null; then | |
| echo "" | |
| echo "=== Cleaning up namespace: $ns ===" | |
| # Delete WVA resources in this namespace | |
| echo " Removing HPAs and VAs..." | |
| kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true | |
| kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true | |
| # Uninstall all helm releases in the namespace | |
| for release in $(helm list -n "$ns" -q 2>/dev/null); do | |
| echo " Uninstalling helm release: $release" | |
| helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true | |
| done | |
| echo " Deleting namespace: $ns" | |
| kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true | |
| else | |
| echo "Namespace $ns does not exist, skipping cleanup" | |
| fi | |
| done | |
| # Clean up legacy namespaces if they exist (these are not PR-specific) | |
| for legacy_ns in llm-d-inference-scheduler workload-variant-autoscaler-system; do | |
| if kubectl get namespace "$legacy_ns" &>/dev/null; then | |
| echo "" | |
| echo "=== Cleaning up legacy namespace: $legacy_ns ===" | |
| # Uninstall all helm releases in the namespace first | |
| for release in $(helm list -n "$legacy_ns" -q 2>/dev/null); do | |
| echo " Uninstalling helm release: $release" | |
| helm uninstall "$release" -n "$legacy_ns" --ignore-not-found --wait --timeout 60s || true | |
| done | |
| echo " Deleting namespace: $legacy_ns" | |
| kubectl delete namespace "$legacy_ns" --ignore-not-found --timeout=60s || true | |
| fi | |
| done | |
| # The helmfile uses a generic release name "workload-variant-autoscaler" which | |
| # produces non-unique ClusterRole names. On shared clusters, these resources | |
| # may be owned by another namespace's release, causing Helm ownership conflicts. | |
| # Fix: adopt them for our namespace so helmfile can proceed. Post-cleanup will | |
| # delete them, and the next user's helmfile run will recreate them fresh. | |
| # Only adopt legacy helmfile-style names (release "workload-variant-autoscaler"). | |
| # PR-specific Helm releases use names like wva-e2e-<run_id>; those live in WVA_NAMESPACE. | |
| # Re-annotating them to LLMD_NAMESPACE breaks Helm ownership and can leave the controller | |
| # ServiceAccount bound to a wrong or unmanaged ClusterRole (cluster-wide list/watch denied). | |
| echo "Adopting shared WVA cluster-scoped resources for namespace $LLMD_NAMESPACE..." | |
| for kind in clusterrole clusterrolebinding; do | |
| kubectl get "$kind" -o json 2>/dev/null | \ | |
| jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | select(.metadata.name | startswith("wva-e2e-") | not) | select(.metadata.annotations["meta.helm.sh/release-namespace"] != null) | .metadata.name' 2>/dev/null | \ | |
| while read -r name; do | |
| current_ns=$(kubectl get "$kind" "$name" -o json 2>/dev/null | jq -r '.metadata.annotations["meta.helm.sh/release-namespace"] // ""') | |
| if [ "$current_ns" != "$LLMD_NAMESPACE" ]; then | |
| echo " Adopting $kind/$name (was owned by '$current_ns')" | |
| kubectl annotate "$kind" "$name" \ | |
| "meta.helm.sh/release-name=workload-variant-autoscaler" \ | |
| "meta.helm.sh/release-namespace=$LLMD_NAMESPACE" \ | |
| --overwrite || true | |
| fi | |
| done | |
| done | |
| echo "" | |
| echo "Cleanup complete for this PR's namespaces" | |
| - name: Apply latest CRDs | |
| run: | | |
| echo "Applying latest VariantAutoscaling CRD..." | |
| # Helm doesn't auto-update CRDs, so we need to apply them manually | |
| # to ensure the cluster has the latest schema (including scaleTargetRef) | |
| kubectl apply -f charts/workload-variant-autoscaler/crds/ | |
      # Primary (Model A1) deployment: installs the WVA controller plus the llm-d
      # serving stack into the PR-specific namespaces via deploy/install.sh.
      - name: Deploy WVA and llm-d infrastructure
        env:
          # HF_TOKEN is inherited from GITHUB_ENV (set in 'Get HF token from cluster secret' step)
          ENVIRONMENT: openshift
          INSTALL_GATEWAY_CTRLPLANE: "false"
          E2E_TESTS_ENABLED: "true"
          # OpenShift typically lacks HPAScaleToZero; e2e forces SCALE_TO_ZERO_ENABLED off for openshift
          # (see test/e2e/config.go). KEDA ScaledObjects support minReplicas=0 for scale-from-zero tests.
          SCALER_BACKEND: keda
          NAMESPACE_SCOPED: "false"
          # Pass PR-specific namespaces to install script
          LLMD_NS: ${{ env.LLMD_NAMESPACE }}
          WVA_NS: ${{ env.WVA_NAMESPACE }}
          # Controller instance label for multi-controller isolation in parallel e2e tests
          CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
          # Skip infra VA/HPA — the smoke test creates its own VA+HPA targeting
          # its own deployment. The infra VA adds a second idle pod to the
          # saturation analysis group, diluting KV cache metrics and preventing
          # scale-up from triggering.
          DEPLOY_VA: "false"
          DEPLOY_HPA: "false"
          # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
          VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
          # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
          DECODE_REPLICAS: "1"
          # OpenShift uses built-in user-workload monitoring, not a separate namespace
          MONITORING_NAMESPACE: openshift-user-workload-monitoring
          # Disable bearer token auth on WVA /metrics endpoint — OpenShift's
          # user-workload-monitoring cannot authenticate with the controller-manager
          # SA token. The endpoint is still only accessible within the cluster network.
          WVA_METRICS_SECURE: "false"
          # Lower saturation thresholds for simulator mode — the simulator's
          # KV-cache and queue metrics are modest, so default thresholds
          # (kvSpareTrigger=0.1, queueSpareTrigger=3) are too high to trigger
          # scale-up reliably. These values trigger when kvUsage > 0.30 or
          # queueLength > 0.5, which the simulator produces under load.
          KV_SPARE_TRIGGER: "0.5"
          QUEUE_SPARE_TRIGGER: "4.5"
          # inference-scheduling guide has routing proxy disabled, so vLLM
          # serves directly on port 8000 (not 8200 behind proxy)
          VLLM_SVC_PORT: "8000"
        run: |
          echo "Deploying WVA and llm-d infrastructure..."
          # Echo the effective configuration so failed runs are easy to diagnose
          # from the job log alone.
          echo "  MODEL_ID: $MODEL_ID"
          echo "  ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
          echo "  LLMD_NS: $LLMD_NS"
          echo "  WVA_NS: $WVA_NS"
          echo "  WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
          echo "  WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
          echo "  CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
          echo "  VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
          echo "  DECODE_REPLICAS: $DECODE_REPLICAS"
          echo "  KV_SPARE_TRIGGER: ${KV_SPARE_TRIGGER:-<default>}"
          echo "  QUEUE_SPARE_TRIGGER: ${QUEUE_SPARE_TRIGGER:-<default>}"
          echo "  HF token configuration: ✓"
          ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
      - name: Create secondary namespace for Model B
        run: |
          echo "Creating secondary namespace for Model B..."
          # --dry-run=client -o yaml | kubectl apply makes namespace creation
          # idempotent (a plain `kubectl create` fails if it already exists).
          kubectl create namespace "$LLMD_NAMESPACE_B" --dry-run=client -o yaml | kubectl apply -f -
          echo "Secondary namespace $LLMD_NAMESPACE_B created"
      - name: Label namespaces for OpenShift monitoring
        run: |
          echo "Adding openshift.io/user-monitoring label to namespaces for Prometheus scraping..."
          # --overwrite keeps relabeling idempotent across workflow re-runs.
          kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
          kubectl label namespace "$LLMD_NAMESPACE_B" openshift.io/user-monitoring=true --overwrite
          kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
          echo "Namespace labels applied"
| - name: Wait for infrastructure to be ready | |
| run: | | |
| echo "Waiting for WVA controller to be ready..." | |
| kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true | |
| kubectl get pods -n "$WVA_NAMESPACE" | |
| # Ensure the vLLM deployment has the correct replica count. | |
| # A previous failed run's "Scale down GPU workloads" step may have set replicas=0 | |
| # and helmfile doesn't override manually-changed replicas on re-deploy. | |
| # kubectl rollout status returns instantly on 0-replica deployments, so we must | |
| # ensure replicas > 0 before waiting. | |
| DESIRED_REPLICAS="${DECODE_REPLICAS:-1}" | |
| CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") | |
| if [ "$CURRENT_REPLICAS" -eq 0 ]; then | |
| echo "WARNING: Model A1 deployment has 0 replicas (likely from previous failed run cleanup)" | |
| echo "Scaling to $DESIRED_REPLICAS replica(s)..." | |
| kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --replicas="$DESIRED_REPLICAS" || { | |
| echo "ERROR: Failed to scale Model A1 deployment" | |
| exit 1 | |
| } | |
| fi | |
| echo "Waiting for Model A1 vLLM deployment to be ready (up to 25 minutes for model loading)..." | |
| # kubectl rollout status waits for all replicas to be Ready, unlike | |
| # --for=condition=available which is satisfied even at 0 ready replicas. | |
| # vLLM model loading takes 15-20 minutes, so we use a 25-minute timeout. | |
| kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || { | |
| echo "WARNING: Model A1 deployment not ready after 25 minutes" | |
| echo "=== Pod status ===" | |
| kubectl get pods -n "$LLMD_NAMESPACE" | |
| echo "=== Deployment conditions ===" | |
| kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" -o jsonpath='{.status.conditions}' | jq . || true | |
| echo "=== Recent events ===" | |
| kubectl get events -n "$LLMD_NAMESPACE" --sort-by='.lastTimestamp' | tail -20 | |
| } | |
| kubectl get pods -n "$LLMD_NAMESPACE" | |
      # Second llm-d serving stack (Model B) in its own namespace; reuses the
      # already-running WVA controller and Prometheus from the primary deploy.
      - name: Deploy Model B infrastructure in secondary namespace
        env:
          # HF_TOKEN is inherited from GITHUB_ENV
          ENVIRONMENT: openshift
          INSTALL_GATEWAY_CTRLPLANE: "false"
          E2E_TESTS_ENABLED: "true"
          SCALER_BACKEND: keda
          NAMESPACE_SCOPED: "false"
          # Override namespaces for Model B stack
          LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
          WVA_NS: ${{ env.WVA_NAMESPACE }}
          # Skip WVA controller and prometheus (use existing)
          DEPLOY_WVA: "false"
          DEPLOY_PROMETHEUS: "false"
          DEPLOY_PROMETHEUS_ADAPTER: "false"
          DEPLOY_VA: "false"
          DEPLOY_HPA: "false"
          # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
          VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
          # Decode replicas for e2e testing (start with 1 replica, let HPA scale)
          DECODE_REPLICAS: "1"
          # OpenShift monitoring settings (same as Model A1 deploy)
          MONITORING_NAMESPACE: openshift-user-workload-monitoring
          WVA_METRICS_SECURE: "false"
          # Same port as Model A1 (inference-scheduling guide, proxy disabled)
          VLLM_SVC_PORT: "8000"
        run: |
          echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
          echo "  MODEL_ID: $MODEL_ID"
          echo "  ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
          echo "  VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
          echo "  DECODE_REPLICAS: $DECODE_REPLICAS"
          # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
          ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
          echo "Waiting for Model B deployment to start (initial rollout)..."
          # Wait briefly for deployments to be created by helm before checking rollout status
          sleep 10
          kubectl get pods -n "$LLMD_NAMESPACE_B"
      # Install only the autoscaling resources (VA, HPA, ServiceMonitor) for
      # Model B as a separate helm release, pointed at the shared controller.
      - name: Deploy Model B WVA resources
        env:
          LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
          WVA_NS: ${{ env.WVA_NAMESPACE }}
          # Use same controller instance as Model A for HPA selector matching
          CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
        run: |
          echo "Deploying Model B WVA resources..."
          echo "  Release name: $MODEL_B_RELEASE"
          echo "  CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
          # Deploy WVA resources (VA, HPA, ServiceMonitor) for Model B
          # controller.enabled=false since we're using the existing WVA controller
          # Note: llmd.modelName should be base name without -decode suffix (template appends it)
          # `upgrade -i` (install-if-missing) keeps this step idempotent on re-runs.
          helm upgrade -i "$MODEL_B_RELEASE" ./charts/workload-variant-autoscaler \
            -n "$WVA_NAMESPACE" \
            --set controller.enabled=false \
            --set va.enabled=true \
            --set hpa.enabled=true \
            --set hpa.behavior.scaleUp.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
            --set hpa.behavior.scaleDown.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
            --set llmd.namespace="$LLMD_NAMESPACE_B" \
            --set llmd.modelName="ms-inference-scheduling-llm-d-modelservice" \
            --set llmd.modelID="$MODEL_ID" \
            --set va.accelerator="$ACCELERATOR_TYPE" \
            --set wva.baseName="inference-scheduling" \
            --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring \
            --set wva.metrics.secure=false \
            --set vllmService.port=8000 \
            --set vllmService.targetPort=8000 \
            --set wva.controllerInstance="$CONTROLLER_INSTANCE"
          echo "Model B WVA resources deployed"
          # Best-effort visibility of what the release created (|| true: listing
          # failures must not fail the deploy step).
          kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
          kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
| - name: Wait for Model B to be ready | |
| run: | | |
| # Same fix as Model A1: ensure replicas > 0 before waiting for rollout | |
| DESIRED_REPLICAS="${DECODE_REPLICAS:-1}" | |
| CURRENT_REPLICAS=$(kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0") | |
| if [ "$CURRENT_REPLICAS" -eq 0 ]; then | |
| echo "WARNING: Model B deployment has 0 replicas (likely from previous failed run cleanup)" | |
| echo "Scaling to $DESIRED_REPLICAS replica(s)..." | |
| kubectl scale deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --replicas="$DESIRED_REPLICAS" || { | |
| echo "ERROR: Failed to scale Model B deployment" | |
| exit 1 | |
| } | |
| fi | |
| echo "Waiting for Model B vLLM deployment to be ready (up to 25 minutes for model loading)..." | |
| # Same as Model A1: use rollout status to wait for actual pod readiness. | |
| kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" --timeout=1500s || { | |
| echo "WARNING: Model B deployment not ready after 25 minutes" | |
| echo "=== Pod status ===" | |
| kubectl get pods -n "$LLMD_NAMESPACE_B" | |
| echo "=== Deployment conditions ===" | |
| kubectl get deployment ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE_B" -o jsonpath='{.status.conditions}' | jq . || true | |
| echo "=== Recent events ===" | |
| kubectl get events -n "$LLMD_NAMESPACE_B" --sort-by='.lastTimestamp' | tail -20 | |
| } | |
      # Read-only status summary of both model stacks and the shared controller;
      # every command is best-effort (|| true) so this step never fails the job.
      - name: Verify multi-model deployment
        run: |
          echo "=== Multi-Model Deployment Status ==="
          echo ""
          echo "=== Model A1 (Primary, $LLMD_NAMESPACE) ==="
          # grep -E "decode|NAME" keeps the header row plus decode deployments only
          kubectl get deployment -n "$LLMD_NAMESPACE" | grep -E "decode|NAME" || true
          kubectl get hpa -n "$LLMD_NAMESPACE" || true
          kubectl get variantautoscaling -n "$LLMD_NAMESPACE" || true
          echo ""
          echo "=== Model B ($LLMD_NAMESPACE_B) ==="
          kubectl get deployment -n "$LLMD_NAMESPACE_B" | grep -E "decode|NAME" || true
          kubectl get hpa -n "$LLMD_NAMESPACE_B" || true
          kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" || true
          echo ""
          echo "=== WVA Controller ($WVA_NAMESPACE) ==="
          kubectl get pods -n "$WVA_NAMESPACE"
      # Pre-flight check of the whole metrics path (vLLM /metrics -> Service
      # endpoints -> Pod/ServiceMonitors -> WVA optimization status) so that
      # test failures can be attributed quickly. Diagnostic only: it logs
      # warnings but never fails the job.
      - name: Verify metrics pipeline
        run: |
          echo "=== Verifying metrics pipeline before running tests ==="
          echo ""
          # 1. Verify vLLM pods are serving /metrics endpoint
          echo "--- Step 1: Checking vLLM /metrics endpoint ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            VLLM_POD=$(kubectl get pods -n "$ns" -l llm-d.ai/inference-serving=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
            if [ -n "$VLLM_POD" ]; then
              # VLLM_SVC_PORT was only set as step-level env on the deploy steps,
              # so the :-8000 default applies here.
              PORT="${VLLM_SVC_PORT:-8000}"
              echo "  Checking vLLM pod $VLLM_POD in $ns (port $PORT)..."
              METRICS=$(kubectl exec -n "$ns" "$VLLM_POD" -- curl -s "http://localhost:${PORT}/metrics" 2>/dev/null | head -5 || true)
              if [ -n "$METRICS" ]; then
                echo "  ✅ vLLM metrics endpoint responding in $ns"
              else
                echo "  ⚠️ vLLM metrics endpoint not responding in $ns (may still be loading)"
              fi
              # Show pod labels for debugging
              kubectl get pod "$VLLM_POD" -n "$ns" -o jsonpath='{.metadata.labels}' | jq -r 'to_entries[] | "    \(.key)=\(.value)"' 2>/dev/null || true
            else
              echo "  ⚠️ No vLLM pods found with label llm-d.ai/inference-serving=true in $ns"
              echo "  All pods in $ns:"
              kubectl get pods -n "$ns" --show-labels 2>/dev/null || true
            fi
          done
          # 1b. Verify vllm-service has endpoints (critical for ServiceMonitor scraping)
          echo ""
          echo "--- Step 1b: Checking vllm-service endpoints ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            SVC_NAME=$(kubectl get svc -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
            if [ -n "$SVC_NAME" ]; then
              # NOTE(review): `kubectl get endpoints` reads the legacy Endpoints
              # API (deprecated in favor of EndpointSlice) — works today, but may
              # warn/break on future cluster versions.
              ENDPOINTS=$(kubectl get endpoints "$SVC_NAME" -n "$ns" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)
              if [ -n "$ENDPOINTS" ]; then
                echo "  ✅ Service $SVC_NAME in $ns has endpoints: $ENDPOINTS"
              else
                echo "  ❌ Service $SVC_NAME in $ns has NO endpoints — label selector mismatch!"
                echo "  Service selector:"
                kubectl get svc "$SVC_NAME" -n "$ns" -o jsonpath='{.spec.selector}' 2>/dev/null | jq . || true
              fi
            else
              echo "  ⚠️ No vllm-service found in $ns"
            fi
          done
          # 1c. Check PodMonitors (llm-d guide deploys these for direct pod scraping)
          echo ""
          echo "--- Step 1c: PodMonitor configuration ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            PM_COUNT=$(kubectl get podmonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
            echo "  PodMonitors in $ns: $PM_COUNT"
            kubectl get podmonitor -n "$ns" 2>/dev/null || true
          done
          # 2. Check WVA controller health
          echo ""
          echo "--- Step 2: WVA controller status ---"
          kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler
          WVA_POD=$(kubectl get pods -n "$WVA_NAMESPACE" -l app.kubernetes.io/name=workload-variant-autoscaler -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
          if [ -n "$WVA_POD" ]; then
            echo "  Recent WVA controller logs:"
            kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=20 | grep -E "reconcil|metrics|error|saturation" || echo "  (no matching log lines)"
          fi
          # 3. Check VariantAutoscaling status
          echo ""
          echo "--- Step 3: VariantAutoscaling status ---"
          kubectl get variantautoscaling -A -o wide 2>/dev/null || echo "  No VariantAutoscalings found"
          # 4. Check ServiceMonitors exist
          echo ""
          echo "--- Step 4: ServiceMonitor configuration ---"
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do
            SM_COUNT=$(kubectl get servicemonitor -n "$ns" --no-headers 2>/dev/null | wc -l | tr -d ' ')
            echo "  ServiceMonitors in $ns: $SM_COUNT"
            kubectl get servicemonitor -n "$ns" 2>/dev/null || true
          done
          # 5. Wait for WVA to start processing metrics (up to 3 minutes)
          echo ""
          echo "--- Step 5: Waiting for WVA to detect metrics (up to 3 minutes) ---"
          METRICS_READY=false
          # 18 attempts x 10s sleep = 3 minutes max wait.
          for i in $(seq 1 18); do
            # A populated desiredOptimizedAlloc.accelerator means the controller
            # has completed at least one optimization cycle.
            VA_STATUS=$(kubectl get variantautoscaling -n "$LLMD_NAMESPACE" -o jsonpath='{.items[0].status.desiredOptimizedAlloc.accelerator}' 2>/dev/null || true)
            if [ -n "$VA_STATUS" ]; then
              echo "  ✅ WVA optimization active — accelerator: $VA_STATUS"
              METRICS_READY=true
              break
            fi
            echo "  Attempt $i/18: WVA not yet optimizing, waiting 10s..."
            sleep 10
          done
          if [ "$METRICS_READY" = "false" ]; then
            echo "  ⚠️ WVA has not started optimizing after 3 minutes"
            echo "  This may cause test timeouts — dumping diagnostics:"
            echo ""
            echo "  === WVA controller logs (last 50 lines) ==="
            kubectl logs "$WVA_POD" -n "$WVA_NAMESPACE" --tail=50 2>/dev/null || true
            echo ""
            echo "  === HPA status ==="
            kubectl get hpa -A 2>/dev/null || true
            echo ""
            echo "  Continuing to tests anyway (they have their own timeouts)..."
          fi
          echo ""
          echo "=== Metrics pipeline verification complete ==="
      - name: Install Go dependencies
        run: |
          # GOTOOLCHAIN=auto lets Go fetch the toolchain version required by
          # go.mod when it is newer than the preinstalled one; the first two
          # commands just log which toolchain is in effect.
          GOTOOLCHAIN=auto go version
          GOTOOLCHAIN=auto go env GOTOOLCHAIN
          GOTOOLCHAIN=auto go mod download
      # The actual test run: all configuration flows to the Go e2e suite via
      # environment variables, echoed first for log-based debugging.
      - name: Run OpenShift E2E tests
        env:
          # Consolidated e2e test environment variables
          ENVIRONMENT: openshift
          USE_SIMULATOR: "true"
          # NOTE(review): test/e2e/config.go reportedly forces scale-to-zero off
          # for openshift (see earlier step comment) — confirm this "true" is
          # intentionally overridden there.
          SCALE_TO_ZERO_ENABLED: "true"
          WVA_NAMESPACE: ${{ env.WVA_NAMESPACE }}
          MONITORING_NAMESPACE: openshift-user-workload-monitoring
          LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
          # Legacy variables for backward compatibility (if needed by tests)
          CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
          # Multi-model testing: secondary namespace for Model B
          LLMD_NAMESPACE_B: ${{ env.LLMD_NAMESPACE_B }}
          GATEWAY_NAME: infra-inference-scheduling-inference-gateway-istio
          DEPLOYMENT: ms-inference-scheduling-llm-d-modelservice-decode
          # Pass WVA_RELEASE_NAME so test can filter for current run's resources
          WVA_RELEASE_NAME: ${{ env.WVA_RELEASE_NAME }}
          # Controller instance label must match what the controller was deployed with
          CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
          MODEL_ID: ${{ env.MODEL_ID }}
          REQUEST_RATE: ${{ env.REQUEST_RATE }}
          NUM_PROMPTS: ${{ env.NUM_PROMPTS }}
        run: |
          echo "Running consolidated E2E tests on OpenShift with configuration:"
          echo "  ENVIRONMENT: $ENVIRONMENT"
          echo "  USE_SIMULATOR: $USE_SIMULATOR"
          echo "  SCALE_TO_ZERO_ENABLED: $SCALE_TO_ZERO_ENABLED"
          echo "  WVA_NAMESPACE: $WVA_NAMESPACE"
          echo "  MONITORING_NAMESPACE: $MONITORING_NAMESPACE"
          echo "  LLMD_NAMESPACE: $LLMD_NAMESPACE"
          echo "  LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
          echo "  DEPLOYMENT: $DEPLOYMENT"
          echo "  GATEWAY_NAME: $GATEWAY_NAME"
          echo "  MODEL_ID: $MODEL_ID"
          echo "  REQUEST_RATE: $REQUEST_RATE"
          echo "  NUM_PROMPTS: $NUM_PROMPTS"
          echo "  WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
          make test-e2e-full
      - name: Cleanup infrastructure
        # Cleanup on success or cancellation, but NOT on failure (preserve for debugging)
        # Use SKIP_CLEANUP=true to keep resources after successful runs
        if: (success() || cancelled()) && env.SKIP_CLEANUP != 'true'
        run: |
          echo "Cleaning up ALL test infrastructure..."
          echo "  LLMD_NAMESPACE: $LLMD_NAMESPACE"
          echo "  LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
          echo "  WVA_NAMESPACE: $WVA_NAMESPACE"
          echo "  WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
          echo "  MODEL_B_RELEASE: $MODEL_B_RELEASE"
          # Uninstall all WVA helm releases before deleting namespaces
          # This ensures proper cleanup of resources and removes helm tracking
          # NOTE(review): `helm uninstall --ignore-not-found` requires Helm >= 3.13 —
          # confirm the runner's helm version.
          echo "Uninstalling WVA helm releases..."
          helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
          helm uninstall "$MODEL_B_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
          echo "Uninstalling llm-d helm releases in primary namespace..."
          for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
            echo "  Uninstalling release: $release"
            helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
          done
          echo "Uninstalling llm-d helm releases in secondary namespace..."
          for release in $(helm list -n "$LLMD_NAMESPACE_B" -q 2>/dev/null); do
            echo "  Uninstalling release: $release"
            helm uninstall "$release" -n "$LLMD_NAMESPACE_B" --ignore-not-found --wait --timeout 60s || true
          done
          # Delete all PR-specific namespaces
          echo "Deleting llm-d namespace $LLMD_NAMESPACE..."
          kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
          echo "Deleting llm-d namespace $LLMD_NAMESPACE_B..."
          kubectl delete namespace "$LLMD_NAMESPACE_B" --ignore-not-found --timeout=120s || true
          echo "Deleting WVA namespace $WVA_NAMESPACE..."
          kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
          # Clean up cluster-scoped WVA resources for THIS release only
          # Use both name and instance labels to avoid deleting resources from other PRs
          echo "Removing cluster-scoped WVA resources for release $WVA_RELEASE_NAME..."
          kubectl delete clusterrole,clusterrolebinding -l app.kubernetes.io/name=workload-variant-autoscaler,app.kubernetes.io/instance="$WVA_RELEASE_NAME" --ignore-not-found || true
          # Also clean up cluster-scoped resources owned by this PR's namespaces
          # (covers helmfile-created resources whose instance label differs from WVA_RELEASE_NAME)
          for kind in clusterrole clusterrolebinding; do
            # Emit "name<TAB>release-namespace" pairs so ownership can be checked
            # per resource; empty string when the helm annotation is absent.
            kubectl get "$kind" -o json 2>/dev/null | \
              jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | "\(.metadata.name)\t\(.metadata.annotations["meta.helm.sh/release-namespace"] // "")"' 2>/dev/null | \
              while IFS=$'\t' read -r name ns; do
                if [ "$ns" = "$LLMD_NAMESPACE" ] || [ "$ns" = "$LLMD_NAMESPACE_B" ] || [ "$ns" = "$WVA_NAMESPACE" ]; then
                  echo "  Deleting $kind/$name (owned by PR namespace '$ns')"
                  kubectl delete "$kind" "$name" --ignore-not-found || true
                fi
              done
          done
          echo "Cleanup complete"
      # Always-on diagnostics dump; every command is best-effort so this step
      # cannot change the job's outcome.
      - name: Dump cluster state
        if: always()
        run: |
          echo "=== Dumping cluster state for diagnostics ==="
          echo ""
          echo "=== VAs ==="
          # `va` is presumably the shortName for variantautoscaling (used in full
          # elsewhere in this workflow) — confirm against the CRD.
          kubectl get va -n "$LLMD_NAMESPACE" 2>/dev/null || true
          kubectl get va -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
          echo ""
          echo "=== HPAs ==="
          kubectl get hpa -n "$LLMD_NAMESPACE" 2>/dev/null || true
          kubectl get hpa -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
          echo ""
          echo "=== Controller pods ==="
          kubectl get pods -n "$WVA_NAMESPACE" 2>/dev/null || true
          echo ""
          echo "=== All resources ==="
          for ns in "$WVA_NAMESPACE" "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            # Skip namespaces that were already deleted (e.g. by cleanup).
            if kubectl get namespace "$ns" &>/dev/null; then
              echo "--- Namespace: $ns ---"
              kubectl get all -n "$ns" 2>/dev/null || true
              echo ""
              echo "--- Events in $ns ---"
              kubectl get events -n "$ns" --sort-by='.lastTimestamp' 2>/dev/null | tail -20 || true
              echo ""
            fi
          done
      - name: Scale down GPU workloads on failure
        # On failure, scale down decode deployments to free GPUs while preserving
        # other resources (VA, HPA, controller, gateway) for debugging
        if: failure()
        run: |
          echo "Test failed - scaling down decode deployments to free GPUs..."
          echo "Other resources (VA, HPA, controller logs) are preserved for debugging"
          echo ""
          for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
            if kubectl get namespace "$ns" &>/dev/null; then
              echo "=== Scaling down decode deployments in $ns ==="
              # NOTE(review): this selector uses llm-d.ai/inferenceServing=true while
              # the metrics-pipeline step selects pods by llm-d.ai/inference-serving=true —
              # confirm which label the deployments actually carry. The name-pattern
              # fallback below covers a mismatch either way.
              kubectl scale deployment -n "$ns" -l llm-d.ai/inferenceServing=true --replicas=0 || true
              # Also try by name pattern in case labels are missing
              kubectl get deployment -n "$ns" -o name 2>/dev/null | grep decode | while read -r deploy; do
                echo "  Scaling down: $deploy"
                kubectl scale "$deploy" -n "$ns" --replicas=0 || true
              done
            fi
          done
  # Report status back to PR for issue_comment triggered runs
  # This ensures fork PRs show the correct status after /ok-to-test runs complete
  report-status:
    runs-on: ubuntu-latest
    needs: [gate, e2e-openshift]
    # Run always (even on failure) but only for issue_comment events
    if: always() && github.event_name == 'issue_comment' && needs.gate.outputs.should_run == 'true'
    steps:
      - name: Report status to PR
        uses: actions/github-script@v7
        with:
          script: |
            // Values are interpolated by Actions before the script runs, so they
            // appear here as string literals.
            const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
            const e2eResult = '${{ needs.e2e-openshift.result }}';
            // Map job result to commit status. The Commit Status API only accepts
            // error/failure/pending/success, so 'cancelled' must map onto one of
            // those (failure here); 'skipped' stays pending so the check isn't
            // shown as green without a real run.
            let state, description;
            if (e2eResult === 'success') {
              state = 'success';
              description = 'E2E tests passed';
            } else if (e2eResult === 'skipped') {
              state = 'pending';
              description = 'E2E tests skipped';
            } else if (e2eResult === 'cancelled') {
              state = 'failure';
              description = 'E2E tests cancelled';
            } else {
              state = 'failure';
              description = 'E2E tests failed';
            }
            console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
              repo: context.repo.repo,
              sha: prHeadSha,
              state: state,
              target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
              description: description,
              context: '${{ github.workflow }} / e2e (comment trigger)'
            });
            console.log('Status reported successfully');