Skip to content

Revert "exp data"

Revert "exp data" #1

name: CI - OpenShift E2E Tests

Check failure on line 1 in .github/workflows/ci-e2e-openshift.yaml

View workflow run for this annotation

GitHub Actions / .github/workflows/ci-e2e-openshift.yaml

Invalid workflow file

(Line: 479, Col: 11): 'BENCHMARK_MODE' is already defined, (Line: 536, Col: 11): 'BENCHMARK_MODE' is already defined
# Token scopes requested for the jobs in this workflow
permissions:
  contents: read
  packages: write
  pull-requests: write  # For posting comments on PRs
  statuses: write  # For reporting status on fork PR commits
# Cancel superseded runs for the same PR so they do not fight over cluster resources.
#
# How the group is chosen:
# - issue_comment events whose body contains neither /ok-to-test nor /retest get a
#   unique 'comment-isolated-{run_id}' group, so ordinary comments never cancel a
#   test run that is already in progress
# - every other event shares 'e2e-openshift-{id}' (a newer run cancels the older one),
#   where id falls back through pull_request.number -> issue.number -> run_id
#
# NOTE: The command list (/ok-to-test, /retest) must stay in sync with the
# validCommands list in the gate job's check step.
# NOTE(review): this expression matches with contains(), whereas the gate job requires
# the trimmed comment to equal a command exactly. A comment that merely mentions
# /retest therefore joins the shared group (and can cancel a running test) even
# though the gate then refuses to run - confirm this is acceptable.
concurrency:
  group: >-
    ${{
    github.event_name == 'issue_comment' &&
    !contains(github.event.comment.body, '/ok-to-test') &&
    !contains(github.event.comment.body, '/retest')
    && format('comment-isolated-{0}', github.run_id)
    || format('e2e-openshift-{0}',
    github.event.pull_request.number
    || github.event.issue.number
    || github.run_id)
    }}
  cancel-in-progress: true
# Triggers: PRs against main/dev, PR comments (filtered by the gate job), and manual runs
on:
  pull_request:
    branches:
      - main
      - dev
  # Lets maintainers start the tests on fork PRs by commenting /ok-to-test
  issue_comment:
    types: [created]
  # Manual runs; every input is optional and falls back to the default shown
  workflow_dispatch:
    inputs:
      model_id:
        description: 'Model ID'
        required: false
        default: 'unsloth/Meta-Llama-3.1-8B'
      accelerator_type:
        description: 'Accelerator type (H100, A100, L40S)'
        required: false
        default: 'H100'
      request_rate:
        description: 'Request rate (req/s)'
        required: false
        default: '20'
      num_prompts:
        description: 'Number of prompts'
        required: false
        default: '3000'
      skip_cleanup:
        description: 'Skip cleanup after tests'
        required: false
        default: 'false'
      max_num_seqs:
        description: 'vLLM max batch size (lower = easier to saturate)'
        required: false
        default: '1'
      hpa_stabilization_seconds:
        description: 'HPA stabilization window in seconds'
        required: false
        default: '30'
jobs:
# Gate: decide whether the e2e jobs may run for the triggering event.
# - workflow_dispatch: always runs
# - issue_comment: runs only for an exact /ok-to-test or /retest comment on a PR,
#   written by a user with write access
# - pull_request: runs directly for non-fork PRs whose author has write access;
#   external contributors must wait for a maintainer to comment /ok-to-test
# Outputs: should_run, pr_number, pr_head_sha, is_fork_pr (all strings)
gate:
  runs-on: ubuntu-latest
  outputs:
    should_run: ${{ steps.check.outputs.should_run }}
    pr_number: ${{ steps.check.outputs.pr_number }}
    pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
    is_fork_pr: ${{ steps.check.outputs.is_fork_pr }}
  steps:
    - name: Check permissions and /ok-to-test
      id: check
      uses: actions/github-script@v7
      with:
        script: |
          const { owner, repo } = context.repo;
          const baseRepo = `${owner}/${repo}`;

          // True when `username` has admin, maintain or write permission on this repo.
          // A failed lookup is logged and treated as "no access".
          async function hasWriteAccess(username) {
            try {
              const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
                owner,
                repo,
                username
              });
              const privilegedRoles = ['admin', 'maintain', 'write'];
              return privilegedRoles.includes(permission.permission);
            } catch (e) {
              console.log(`Could not get permissions for ${username}: ${e.message}`);
              return false;
            }
          }

          // Resolves a PR's head repository and whether it differs from the base repo.
          // A PR without a head repo object is treated as coming from the base repo.
          function resolveHead(pr) {
            const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
            return { headRepo, isFork: headRepo !== baseRepo };
          }

          // Always run for workflow_dispatch
          if (context.eventName === 'workflow_dispatch') {
            core.setOutput('should_run', 'true');
            core.setOutput('pr_number', '');
            core.setOutput('pr_head_sha', context.sha);
            core.setOutput('is_fork_pr', 'false');
            return;
          }

          // Handle issue_comment event (/ok-to-test or /retest)
          if (context.eventName === 'issue_comment') {
            const comment = context.payload.comment.body.trim();
            const issue = context.payload.issue;

            // Only process /ok-to-test or /retest comments on PRs
            if (!issue.pull_request) {
              console.log('Comment is not on a PR, skipping');
              core.setOutput('should_run', 'false');
              return;
            }

            // NOTE: This list must stay in sync with the workflow-level concurrency group expression
            const validCommands = ['/ok-to-test', '/retest'];
            if (!validCommands.includes(comment)) {
              console.log(`Comment "${comment}" is not a valid trigger command, skipping`);
              core.setOutput('should_run', 'false');
              return;
            }

            // Check if commenter has write access
            const commenter = context.payload.comment.user.login;
            const hasAccess = await hasWriteAccess(commenter);
            if (!hasAccess) {
              console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
              core.setOutput('should_run', 'false');
              return;
            }

            // The comment payload carries no head SHA, so fetch the PR itself
            const { data: pr } = await github.rest.pulls.get({
              owner,
              repo,
              pull_number: issue.number
            });
            const { headRepo, isFork } = resolveHead(pr);

            console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
            console.log(`PR head SHA: ${pr.head.sha}`);
            console.log(`Is fork PR: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
            core.setOutput('should_run', 'true');
            core.setOutput('pr_number', issue.number.toString());
            core.setOutput('pr_head_sha', pr.head.sha);
            core.setOutput('is_fork_pr', isFork ? 'true' : 'false');

            // Add reaction to acknowledge
            await github.rest.reactions.createForIssueComment({
              owner,
              repo,
              comment_id: context.payload.comment.id,
              content: 'rocket'
            });

            // Post comment with link to the e2e workflow run
            const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number: issue.number,
              body: `🚀 **E2E tests triggered by ${comment}**\n\n[View the OpenShift E2E workflow run](${runUrl})`
            });
            return;
          }

          // Handle pull_request event
          const pr = context.payload.pull_request;
          const prAuthor = pr.user.login;
          const prNumber = pr.number;
          const prHeadSha = pr.head.sha;
          const { headRepo, isFork } = resolveHead(pr);
          console.log(`PR #${prNumber} is from fork: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
          core.setOutput('pr_number', prNumber.toString());
          core.setOutput('pr_head_sha', prHeadSha);
          core.setOutput('is_fork_pr', isFork ? 'true' : 'false');

          // Check if PR author has write access
          const isPrivileged = await hasWriteAccess(prAuthor);
          console.log(`PR #${prNumber} author ${prAuthor}: privileged=${isPrivileged}`);

          // Check if we already posted a bot comment. Paginate so the check sees every
          // comment on the PR, not just the first page returned by the API.
          const comments = await github.paginate(github.rest.issues.listComments, {
            owner,
            repo,
            issue_number: prNumber,
            per_page: 100
          });
          // Null-safe: tolerate comments without an author object or body
          const botComment = comments.find(c =>
            c.user?.type === 'Bot' &&
            (c.body || '').includes('ok-to-test')
          );

          // Helper to safely post a comment (may fail on fork PRs due to permissions)
          async function tryPostComment(body) {
            try {
              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number: prNumber,
                body: body
              });
              return true;
            } catch (e) {
              // Fork PRs can't post comments on pull_request event (GitHub security restriction)
              console.log(`Could not post comment (expected for fork PRs): ${e.message}`);
              return false;
            }
          }

          if (isPrivileged) {
            // For maintainer/admin fork PRs, we need to trigger via /ok-to-test
            // because fork PRs don't have access to secrets on pull_request event
            if (isFork) {
              console.log(`Maintainer fork PR detected - auto-triggering /ok-to-test for ${prAuthor}`);
              core.setOutput('should_run', 'false'); // Don't run on pull_request event
              // Auto-post /ok-to-test to trigger issue_comment workflow
              // NOTE(review): no github-token input is set, so this comment is created with
              // the default GITHUB_TOKEN; GitHub documents that events created with that
              // token do not start new workflow runs - confirm the auto-trigger works.
              if (!botComment) {
                const posted = await tryPostComment(`/ok-to-test`);
                if (!posted) {
                  console.log('Note: Maintainer will need to manually comment /ok-to-test');
                }
              }
              return;
            }
            // Non-fork PR from maintainer - run directly
            core.setOutput('should_run', 'true');
            return;
          }

          // External contributor - post instructions and skip
          console.log('External contributor PR - posting instructions');
          core.setOutput('should_run', 'false');
          if (!botComment) {
            const posted = await tryPostComment(`👋 Thanks for your contribution!\n\nThis PR is from a fork, so the e2e tests require approval to run (they use GPU resources).\n\n**For maintainers/admins:** Comment \`/ok-to-test\` to trigger the e2e tests after reviewing the code.\n\n**For contributors:** Please wait for a maintainer or admin to approve running the tests.`);
            if (!posted) {
              console.log('Note: Could not post instructions comment on fork PR');
            }
          }
# Build and push the WVA controller image on a GitHub-hosted runner (has proper Docker setup).
# Fork PRs are skipped on the pull_request event (no secrets access); for those the
# build runs in the issue_comment-triggered run that follows /ok-to-test.
build-image:
  needs: gate
  if: |
    needs.gate.outputs.should_run == 'true' &&
    (needs.gate.outputs.is_fork_pr != 'true' || github.event_name != 'pull_request')
  runs-on: ubuntu-latest
  outputs:
    image_tag: ${{ steps.build.outputs.image_tag }}
  steps:
    - name: Checkout source
      uses: actions/checkout@v4
      with:
        # Head SHA resolved by the gate job (set for both pull_request and issue_comment)
        ref: ${{ needs.gate.outputs.pr_head_sha }}
    - name: Log in to GHCR
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ secrets.CR_USER }}
        password: ${{ secrets.CR_TOKEN }}
    - name: Build and push image
      id: build
      env:
        REGISTRY: ghcr.io
        IMAGE_NAME: ${{ github.repository }}
        # Head SHA resolved by the gate job
        GIT_REF: ${{ needs.gate.outputs.pr_head_sha }}
      run: |
        # Tag the image "ref-" + the first 8 characters of the git ref (POSIX cut)
        IMAGE_TAG="ref-$(printf '%s' "$GIT_REF" | cut -c1-8)"
        FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
        echo "Building image: $FULL_IMAGE"
        echo "Git ref: $GIT_REF"
        # Build and push through the repository's make targets
        make docker-build IMG="$FULL_IMAGE"
        make docker-push IMG="$FULL_IMAGE"
        # Publish the tag as the step output that feeds the job's image_tag output
        echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
        echo "Image built and pushed: $FULL_IMAGE"
# End-to-end tests, executed on the self-hosted OpenShift runner
e2e-openshift:
  runs-on: [self-hosted, openshift]
  needs: [gate, build-image]
  if: needs.gate.outputs.should_run == 'true'
  env:
    # Test parameters: workflow_dispatch inputs when present, otherwise the fallback literal
    MODEL_ID: ${{ github.event.inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }}
    # NOTE(review): the fallback is 'A100' while the workflow_dispatch input defaults
    # to 'H100' - confirm the difference is intentional
    ACCELERATOR_TYPE: ${{ github.event.inputs.accelerator_type || 'A100' }}
    REQUEST_RATE: ${{ github.event.inputs.request_rate || '20' }}
    NUM_PROMPTS: ${{ github.event.inputs.num_prompts || '3000' }}
    MAX_NUM_SEQS: ${{ github.event.inputs.max_num_seqs || '1' }}
    HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '30' }}
    SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
    # Use main branch of llm-d/llm-d for inferencepool chart v1.2.1 (GA API support)
    LLM_D_RELEASE: main
    # Namespaces are suffixed with the PR number (or the run id when there is none)
    # to isolate concurrent PR tests from each other
    # Primary llm-d namespace (Model A1 + A2)
    LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
    # Secondary llm-d namespace (Model B)
    LLMD_NAMESPACE_B: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}-b
    # WVA controller namespace (monitors all models)
    WVA_NAMESPACE: llm-d-autoscaler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
    # Release names carry the run id so they are unique per run
    WVA_RELEASE_NAME: wva-e2e-${{ github.run_id }}
    # Model A1: Primary deployment in LLMD_NAMESPACE
    MODEL_A1_RELEASE: model-a1-${{ github.run_id }}
    # Model B: Deployment in LLMD_NAMESPACE_B
    MODEL_B_RELEASE: model-b-${{ github.run_id }}
    # Tag of the image produced by the build-image job
    WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
steps:
- name: Checkout source
  uses: actions/checkout@v4
  with:
    # Head SHA resolved by the gate job (set for both pull_request and issue_comment)
    ref: ${{ needs.gate.outputs.pr_head_sha }}
# Copies the version from go.mod's "go" directive into GITHUB_ENV as GO_VERSION
- name: Extract Go version from go.mod
  run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> $GITHUB_ENV
- name: Set up Go
  uses: actions/setup-go@v5
  with:
    # GO_VERSION was exported by the previous step
    go-version: "${{ env.GO_VERSION }}"
    cache-dependency-path: ./go.sum
# Installs make, a pinned kubectl (checksum-verified), the OpenShift CLI and Helm
- name: Install tools (kubectl, oc, helm, make)
  run: |
    sudo apt-get update && sudo apt-get install -y make
    # Install kubectl - use pinned version for reproducible CI builds
    # Pinned 2025-12: v1.31.0 tested compatible with OpenShift 4.16+
    # Update this version when upgrading target cluster or during regular dependency reviews
    KUBECTL_VERSION="v1.31.0"
    echo "Installing kubectl version: $KUBECTL_VERSION"
    curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
    curl -fsSL --retry 3 --retry-delay 5 -o kubectl.sha256 "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256"
    # Checksum input uses the canonical "<hash>  <file>" form (two-character separator)
    echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
    chmod +x kubectl
    sudo mv kubectl /usr/local/bin/
    rm -f kubectl.sha256
    # Install oc (OpenShift CLI)
    # NOTE(review): oc ('stable' channel) and the Helm installer below (main branch)
    # are not pinned, unlike kubectl - consider pinning them as well
    curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz"
    tar -xzf openshift-client-linux.tar.gz
    sudo mv oc /usr/local/bin/
    # Remove the archive and leftover files (presumably the extras unpacked from it)
    rm -f openshift-client-linux.tar.gz kubectl README.md
    # Install helm
    curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# Smoke check: both commands must succeed against the cluster before anything is deployed
- name: Verify cluster access
  run: |
    echo "Verifying cluster access..."
    kubectl cluster-info
    kubectl get nodes
# Reads HF_TOKEN from the llm-d-hf-token secret (default namespace), masks it in the
# logs and exports it through GITHUB_ENV for the following steps
- name: Get HF token from cluster secret
  id: hf-token
  run: |
    echo "Reading HF token from cluster secret llm-d-hf-token in default namespace..."
    # Test for the secret first so a missing secret and a missing key
    # produce different error messages
    if ! kubectl get secret llm-d-hf-token -n default &>/dev/null; then
      echo "::error::Secret 'llm-d-hf-token' not found in default namespace"
      echo "::error::Please ensure the HF token secret exists on the cluster"
      exit 1
    fi
    # Decode the base64-encoded HF_TOKEN key of the secret
    HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d)
    if [ -z "$HF_TOKEN" ]; then
      echo "::error::Secret 'llm-d-hf-token' exists but 'HF_TOKEN' key is empty or missing"
      exit 1
    fi
    # Register the value as a mask before it is written anywhere
    echo "::add-mask::$HF_TOKEN"
    # Make HF_TOKEN available to subsequent steps
    echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV
    echo "HF token retrieved successfully from cluster secret"
# Removes leftovers of earlier runs: this PR's three namespaces plus two legacy
# (non PR-specific) namespaces. Every delete is best-effort (|| true).
- name: Clean up resources for this PR
  run: |
    echo "Cleaning up WVA resources for this PR's namespaces only..."
    echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
    echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
    echo " WVA_NAMESPACE: $WVA_NAMESPACE"
    # Restrict cleanup to the 3 namespaces that belong to THIS PR;
    # namespaces of other PRs are left alone to avoid race conditions
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B" "$WVA_NAMESPACE"; do
      if kubectl get namespace "$ns" &>/dev/null; then
        echo ""
        echo "=== Cleaning up namespace: $ns ==="
        # WVA-labelled HPAs and VariantAutoscalings go first
        echo " Removing HPAs and VAs..."
        kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
        kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
        # Then every helm release found in the namespace
        for release in $(helm list -n "$ns" -q 2>/dev/null); do
          echo " Uninstalling helm release: $release"
          helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true
        done
        echo " Deleting namespace: $ns"
        kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true
      else
        echo "Namespace $ns does not exist, skipping cleanup"
      fi
    done
    # Legacy namespaces (not PR-specific) are removed when present
    for legacy_ns in llm-d-inference-scheduler workload-variant-autoscaler-system; do
      if kubectl get namespace "$legacy_ns" &>/dev/null; then
        echo ""
        echo "=== Cleaning up legacy namespace: $legacy_ns ==="
        # Helm releases first, then the namespace itself
        for release in $(helm list -n "$legacy_ns" -q 2>/dev/null); do
          echo " Uninstalling helm release: $release"
          helm uninstall "$release" -n "$legacy_ns" --ignore-not-found --wait --timeout 60s || true
        done
        echo " Deleting namespace: $legacy_ns"
        kubectl delete namespace "$legacy_ns" --ignore-not-found --timeout=60s || true
      fi
    done
    echo ""
    echo "Cleanup complete for this PR's namespaces"
- name: Apply latest CRDs
  run: |
    echo "Applying latest VariantAutoscaling CRD..."
    # Helm does not update CRDs on its own, so apply them by hand to make sure
    # the cluster carries the latest schema (including scaleTargetRef)
    kubectl apply -f charts/workload-variant-autoscaler/crds/
# Runs deploy/install.sh to set up the WVA controller and the primary llm-d stack
# (Model A1) in this PR's namespaces
- name: Deploy WVA and llm-d infrastructure
  env:
    # HF_TOKEN is inherited from GITHUB_ENV (set in 'Get HF token from cluster secret' step)
    ENVIRONMENT: openshift
    INSTALL_GATEWAY_CTRLPLANE: "false"
    # Disable benchmark mode - istioBench environment not available in llm-d helmfile
    # (declare this key exactly once: a repeated env key makes the workflow file invalid)
    BENCHMARK_MODE: "false"
    E2E_TESTS_ENABLED: "true"
    NAMESPACE_SCOPED: "false"
    # Pass PR-specific namespaces to install script
    LLMD_NS: ${{ env.LLMD_NAMESPACE }}
    WVA_NS: ${{ env.WVA_NAMESPACE }}
    # Controller instance label for multi-controller isolation in parallel e2e tests
    CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
    # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
    VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
  run: |
    echo "Deploying WVA and llm-d infrastructure..."
    echo " MODEL_ID: $MODEL_ID"
    echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
    echo " LLMD_NS: $LLMD_NS"
    echo " WVA_NS: $WVA_NS"
    echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
    echo " WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
    echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
    echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
    echo " HF token configuration: ✓"
    ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
- name: Create secondary namespace for Model B
  run: |
    echo "Creating secondary namespace for Model B..."
    # Render the namespace manifest client-side and apply it
    kubectl create namespace "$LLMD_NAMESPACE_B" --dry-run=client -o yaml | kubectl apply -f -
    echo "Secondary namespace $LLMD_NAMESPACE_B created"
- name: Label namespaces for OpenShift monitoring
  run: |
    echo "Adding openshift.io/user-monitoring label to namespaces for Prometheus scraping..."
    kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
    kubectl label namespace "$LLMD_NAMESPACE_B" openshift.io/user-monitoring=true --overwrite
    kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
    echo "Namespace labels applied"
- name: Wait for infrastructure to be ready
  run: |
    echo "Waiting for WVA controller to be ready..."
    # Best-effort wait: a timeout does not fail the step (|| true)
    kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" || true
    kubectl get pods -n "$WVA_NAMESPACE"
    # NOTE(review): despite the message, no 'kubectl wait' is issued for Model A1;
    # the pods are only listed - confirm readiness is awaited elsewhere
    echo "Waiting for llm-d deployment (Model A1) to be ready..."
    kubectl get pods -n "$LLMD_NAMESPACE"
# Runs deploy/install.sh a second time, targeting the secondary namespace, with the
# WVA controller, Prometheus, VA and HPA deployment switched off
- name: Deploy Model B infrastructure in secondary namespace
  env:
    # HF_TOKEN is inherited from GITHUB_ENV
    ENVIRONMENT: openshift
    INSTALL_GATEWAY_CTRLPLANE: "false"
    # Disable benchmark mode - istioBench environment not available in llm-d helmfile
    # (declare this key exactly once: a repeated env key makes the workflow file invalid)
    BENCHMARK_MODE: "false"
    E2E_TESTS_ENABLED: "true"
    NAMESPACE_SCOPED: "false"
    # Override namespaces for Model B stack
    LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
    WVA_NS: ${{ env.WVA_NAMESPACE }}
    # Skip WVA controller and prometheus (use existing)
    DEPLOY_WVA: "false"
    DEPLOY_PROMETHEUS: "false"
    DEPLOY_PROMETHEUS_ADAPTER: "false"
    DEPLOY_VA: "false"
    DEPLOY_HPA: "false"
    # vLLM max-num-seqs for e2e testing (lower = easier to saturate)
    VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
  run: |
    echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
    echo " MODEL_ID: $MODEL_ID"
    echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
    echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
    # Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
    ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift
    echo "Waiting for Model B deployment to be ready..."
    # Best-effort wait: a timeout does not fail the step (|| true)
    kubectl wait --for=condition=available --timeout=300s deployment --all -n "$LLMD_NAMESPACE_B" || true
    kubectl get pods -n "$LLMD_NAMESPACE_B"
# Installs the WVA chart a second time (controller disabled) to create the VA/HPA
# resources for Model B
- name: Deploy Model B WVA resources
  env:
    LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}
    WVA_NS: ${{ env.WVA_NAMESPACE }}
    # Same controller instance as Model A so the HPA selector matches
    CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
  run: |
    echo "Deploying Model B WVA resources..."
    echo " Release name: $MODEL_B_RELEASE"
    echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
    # WVA resources (VA, HPA, ServiceMonitor) for Model B.
    # controller.enabled=false because the already-deployed WVA controller is reused.
    # Note: llmd.modelName is the base name without the -decode suffix (the template appends it)
    helm upgrade -i "$MODEL_B_RELEASE" ./charts/workload-variant-autoscaler \
      -n "$WVA_NAMESPACE" \
      --set controller.enabled=false \
      --set va.enabled=true \
      --set hpa.enabled=true \
      --set hpa.behavior.scaleUp.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
      --set hpa.behavior.scaleDown.stabilizationWindowSeconds="$HPA_STABILIZATION_SECONDS" \
      --set llmd.namespace="$LLMD_NAMESPACE_B" \
      --set llmd.modelName="ms-inference-scheduling-llm-d-modelservice" \
      --set llmd.modelID="$MODEL_ID" \
      --set va.accelerator="$ACCELERATOR_TYPE" \
      --set wva.baseName="inference-scheduling" \
      --set wva.prometheus.monitoringNamespace=openshift-user-workload-monitoring \
      --set wva.controllerInstance="$CONTROLLER_INSTANCE"
    echo "Model B WVA resources deployed"
    # Informational listing only; failures are ignored
    kubectl get hpa -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
    kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" -l app.kubernetes.io/instance="$MODEL_B_RELEASE" || true
# Prints a status overview of both model namespaces and the controller namespace.
# Only the final pod listing can fail the step; the rest is best-effort (|| true).
- name: Verify multi-model deployment
  run: |
    echo "=== Multi-Model Deployment Status ==="
    echo ""
    echo "=== Model A1 (Primary, $LLMD_NAMESPACE) ==="
    kubectl get deployment -n "$LLMD_NAMESPACE" | grep -E "decode|NAME" || true
    kubectl get hpa -n "$LLMD_NAMESPACE" || true
    kubectl get variantautoscaling -n "$LLMD_NAMESPACE" || true
    echo ""
    echo "=== Model B ($LLMD_NAMESPACE_B) ==="
    kubectl get deployment -n "$LLMD_NAMESPACE_B" | grep -E "decode|NAME" || true
    kubectl get hpa -n "$LLMD_NAMESPACE_B" || true
    kubectl get variantautoscaling -n "$LLMD_NAMESPACE_B" || true
    echo ""
    echo "=== WVA Controller ($WVA_NAMESPACE) ==="
    kubectl get pods -n "$WVA_NAMESPACE"
- name: Install Go dependencies
  run: go mod download
# Runs the repository's test-e2e-openshift make target with the test configuration
# passed through environment variables
- name: Run OpenShift E2E tests
  env:
    CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
    MONITORING_NAMESPACE: openshift-user-workload-monitoring
    LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
    # Secondary namespace (Model B) for multi-model testing
    LLMD_NAMESPACE_B: ${{ env.LLMD_NAMESPACE_B }}
    GATEWAY_NAME: infra-inference-scheduling-inference-gateway-istio
    DEPLOYMENT: ms-inference-scheduling-llm-d-modelservice-decode
    # WVA_RELEASE_NAME lets the test filter for the current run's resources
    WVA_RELEASE_NAME: ${{ env.WVA_RELEASE_NAME }}
  run: |
    echo "Running OpenShift E2E tests with configuration:"
    echo " CONTROLLER_NAMESPACE: $CONTROLLER_NAMESPACE"
    echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
    echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B (multi-model)"
    echo " DEPLOYMENT: $DEPLOYMENT"
    echo " GATEWAY_NAME: $GATEWAY_NAME"
    echo " MODEL_ID: $MODEL_ID"
    echo " REQUEST_RATE: $REQUEST_RATE"
    echo " NUM_PROMPTS: $NUM_PROMPTS"
    echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
    make test-e2e-openshift
# Full teardown of this run's releases, namespaces and cluster-scoped WVA resources
- name: Cleanup infrastructure
  # Runs after success or cancellation, but NOT after failure (resources are kept
  # for debugging). Set SKIP_CLEANUP=true to keep resources after successful runs.
  if: (success() || cancelled()) && env.SKIP_CLEANUP != 'true'
  run: |
    echo "Cleaning up ALL test infrastructure..."
    echo " LLMD_NAMESPACE: $LLMD_NAMESPACE"
    echo " LLMD_NAMESPACE_B: $LLMD_NAMESPACE_B"
    echo " WVA_NAMESPACE: $WVA_NAMESPACE"
    echo " WVA_RELEASE_NAME: $WVA_RELEASE_NAME"
    echo " MODEL_B_RELEASE: $MODEL_B_RELEASE"
    # Helm releases are uninstalled before their namespaces are deleted so that
    # resources are cleaned up properly and helm tracking is removed
    echo "Uninstalling WVA helm releases..."
    helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    helm uninstall "$MODEL_B_RELEASE" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    echo "Uninstalling llm-d helm releases in primary namespace..."
    for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
      echo " Uninstalling release: $release"
      helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    done
    echo "Uninstalling llm-d helm releases in secondary namespace..."
    for release in $(helm list -n "$LLMD_NAMESPACE_B" -q 2>/dev/null); do
      echo " Uninstalling release: $release"
      helm uninstall "$release" -n "$LLMD_NAMESPACE_B" --ignore-not-found --wait --timeout 60s || true
    done
    # Remove all three PR-specific namespaces
    echo "Deleting llm-d namespace $LLMD_NAMESPACE..."
    kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
    echo "Deleting llm-d namespace $LLMD_NAMESPACE_B..."
    kubectl delete namespace "$LLMD_NAMESPACE_B" --ignore-not-found --timeout=120s || true
    echo "Deleting WVA namespace $WVA_NAMESPACE..."
    kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
    # Cluster-scoped WVA resources of THIS release only: selecting on both the name
    # and the instance label keeps resources of other PRs untouched
    echo "Removing cluster-scoped WVA resources for release $WVA_RELEASE_NAME..."
    kubectl delete clusterrole,clusterrolebinding -l app.kubernetes.io/name=workload-variant-autoscaler,app.kubernetes.io/instance="$WVA_RELEASE_NAME" --ignore-not-found || true
    echo "Cleanup complete"
- name: Scale down GPU workloads on failure
  # After a failure, decode deployments are scaled to zero to free GPUs while the
  # other resources (VA, HPA, controller, gateway) stay in place for debugging
  if: failure()
  run: |
    echo "Test failed - scaling down decode deployments to free GPUs..."
    echo "Other resources (VA, HPA, controller logs) are preserved for debugging"
    echo ""
    for ns in "$LLMD_NAMESPACE" "$LLMD_NAMESPACE_B"; do
      if kubectl get namespace "$ns" &>/dev/null; then
        echo "=== Scaling down decode deployments in $ns ==="
        kubectl scale deployment -n "$ns" -l llm-d.ai/inferenceServing=true --replicas=0 || true
        # Second pass by name, in case the label is missing
        kubectl get deployment -n "$ns" -o name 2>/dev/null | grep decode | while read -r deploy; do
          echo " Scaling down: $deploy"
          kubectl scale "$deploy" -n "$ns" --replicas=0 || true
        done
      fi
    done
    echo ""
    echo "GPU workloads scaled down. Remaining resources for debugging:"
    echo ""
    echo "=== VAs ==="
    kubectl get va -n "$LLMD_NAMESPACE" 2>/dev/null || true
    kubectl get va -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
    echo ""
    echo "=== HPAs ==="
    kubectl get hpa -n "$LLMD_NAMESPACE" 2>/dev/null || true
    kubectl get hpa -n "$LLMD_NAMESPACE_B" 2>/dev/null || true
    echo ""
    echo "=== Controller pods ==="
    kubectl get pods -n "$WVA_NAMESPACE" 2>/dev/null || true
# Report status back to PR for issue_comment triggered runs
# This ensures fork PRs show the correct status after /ok-to-test runs complete
report-status:
  runs-on: ubuntu-latest
  # build-image is listed so its result can be inspected: when the build fails,
  # e2e-openshift is skipped and would otherwise be reported as a pending status
  needs: [gate, build-image, e2e-openshift]
  # Run always (even on failure) but only for issue_comment events
  if: always() && github.event_name == 'issue_comment' && needs.gate.outputs.should_run == 'true'
  steps:
    - name: Report status to PR
      uses: actions/github-script@v7
      with:
        script: |
          const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
          const buildResult = '${{ needs.build-image.result }}';
          const e2eResult = '${{ needs.e2e-openshift.result }}';

          // Map job results to a commit status
          let state, description;
          if (e2eResult === 'success') {
            state = 'success';
            description = 'E2E tests passed';
          } else if (e2eResult === 'skipped' && (buildResult === 'failure' || buildResult === 'cancelled')) {
            // The tests never started because the image build did not finish;
            // report that as a failure instead of leaving the commit pending
            state = 'failure';
            description = buildResult === 'failure'
              ? 'Image build failed; E2E tests did not run'
              : 'Image build cancelled; E2E tests did not run';
          } else if (e2eResult === 'skipped') {
            state = 'pending';
            description = 'E2E tests skipped';
          } else if (e2eResult === 'cancelled') {
            state = 'failure';
            description = 'E2E tests cancelled';
          } else {
            state = 'failure';
            description = 'E2E tests failed';
          }

          console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: prHeadSha,
            state: state,
            target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
            description: description,
            context: '${{ github.workflow }} / e2e (comment trigger)'
          });
          console.log('Status reported successfully');