🌱 Hot fix to e2e test on OpenShift #587

name: CI - OpenShift E2E Tests
# Permissions needed for various jobs
permissions:
contents: read
pull-requests: write # For posting comments on PRs
statuses: write # For reporting status on fork PR commits
# Cancel previous runs on the same PR to avoid resource conflicts
# Only group by PR number for legitimate triggers (pull_request, workflow_dispatch, /ok-to-test, or /retest comments)
# Regular comments get a unique group (run_id) so they don't cancel in-progress test runs
#
# Logic:
# - Regular comments (not /ok-to-test or /retest): unique group prevents cancellation of real tests
# - Valid triggers: group 'fma-e2e-openshift-{pr_number}' (can cancel previous runs for same PR)
# - Fallback chain for ID: pull_request.number -> issue.number -> run_id
#
# NOTE: Valid command list (/ok-to-test, /retest) must stay in sync with gate job validation
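#
# Example (illustrative): a /retest comment on PR 123 resolves to group
# 'fma-e2e-openshift-123' and may cancel an older run for that PR, while an
# unrelated comment resolves to 'comment-isolated-<run_id>' and cancels nothing.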
concurrency:
group: >-
${{
github.event_name == 'issue_comment' &&
!contains(github.event.comment.body, '/ok-to-test') &&
!contains(github.event.comment.body, '/retest')
&& format('comment-isolated-{0}', github.run_id)
|| format('fma-e2e-openshift-{0}',
github.event.pull_request.number
|| github.event.issue.number
|| github.run_id)
}}
cancel-in-progress: true
on:
pull_request:
branches:
- main
# Allow maintainers to trigger tests on fork PRs via /ok-to-test comment
issue_comment:
types: [created]
workflow_dispatch:
inputs:
skip_cleanup:
description: 'Skip cleanup after tests'
required: false
default: 'false'
jobs:
# Gate: Check permissions and handle /ok-to-test for fork PRs
# - Maintainers (write access): Tests run automatically
# - External contributors: Must wait for maintainer to comment /ok-to-test
gate:
runs-on: ubuntu-latest
outputs:
should_run: ${{ steps.check.outputs.should_run }}
pr_number: ${{ steps.check.outputs.pr_number }}
pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
is_fork_pr: ${{ steps.check.outputs.is_fork_pr }}
steps:
- name: Check permissions and /ok-to-test
id: check
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Helper to check if user has write access
async function hasWriteAccess(username) {
try {
const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: username
});
const privilegedRoles = ['admin', 'maintain', 'write'];
return privilegedRoles.includes(permission.permission);
} catch (e) {
console.log(`Could not get permissions for ${username}: ${e.message}`);
return false;
}
}
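// Note: the legacy `permission` field returned here is usually one of
// admin/write/read/none; 'maintain' is also accepted above in case the API
// surfaces the finer-grained role name directly.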
// Always run for workflow_dispatch
if (context.eventName === 'workflow_dispatch') {
core.setOutput('should_run', 'true');
core.setOutput('pr_number', '');
core.setOutput('pr_head_sha', context.sha);
core.setOutput('is_fork_pr', 'false');
return;
}
// Handle issue_comment event (/ok-to-test or /retest)
if (context.eventName === 'issue_comment') {
const comment = context.payload.comment.body.trim();
const issue = context.payload.issue;
// Only process /ok-to-test or /retest comments on PRs
if (!issue.pull_request) {
console.log('Comment is not on a PR, skipping');
core.setOutput('should_run', 'false');
return;
}
// NOTE: This list must stay in sync with concurrency group logic
const validCommands = ['/ok-to-test', '/retest'];
if (!validCommands.includes(comment)) {
console.log(`Comment "${comment}" is not a valid trigger command, skipping`);
core.setOutput('should_run', 'false');
return;
}
// Check if commenter has write access
const commenter = context.payload.comment.user.login;
const hasAccess = await hasWriteAccess(commenter);
if (!hasAccess) {
console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
core.setOutput('should_run', 'false');
return;
}
// Fetch the PR to obtain its head SHA
const { data: pr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: issue.number
});
// Check if PR is from a fork
const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
const isFork = headRepo !== baseRepo;
console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
console.log(`PR head SHA: ${pr.head.sha}`);
console.log(`Is fork PR: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
core.setOutput('should_run', 'true');
core.setOutput('pr_number', issue.number.toString());
core.setOutput('pr_head_sha', pr.head.sha);
core.setOutput('is_fork_pr', isFork ? 'true' : 'false');
// Add reaction to acknowledge
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});
// Post comment with link to the e2e workflow run
const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issue.number,
body: `🚀 **E2E tests triggered by ${comment}**\n\n[View the OpenShift E2E workflow run](${runUrl})`
});
return;
}
// Handle pull_request event
const pr = context.payload.pull_request;
const prAuthor = pr.user.login;
const prNumber = pr.number;
const prHeadSha = pr.head.sha;
// Check if PR is from a fork
const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
const isFork = headRepo !== baseRepo;
console.log(`PR #${prNumber} is from fork: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
core.setOutput('pr_number', prNumber.toString());
core.setOutput('pr_head_sha', prHeadSha);
core.setOutput('is_fork_pr', isFork ? 'true' : 'false');
// Check if PR author has write access
const isPrivileged = await hasWriteAccess(prAuthor);
console.log(`PR #${prNumber} author ${prAuthor}: privileged=${isPrivileged}`);
// Check if we already posted a bot comment
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber
});
const botComment = comments.data.find(c =>
c.user.type === 'Bot' &&
c.body.includes('ok-to-test')
);
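// botComment guards the tryPostComment calls below so the bot does not post
// duplicate /ok-to-test or instruction comments on every push to the same PR.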
// Helper to safely post a comment (may fail on fork PRs due to permissions)
async function tryPostComment(body) {
try {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: body
});
return true;
} catch (e) {
// On pull_request events from fork PRs the token is read-only, so comment creation fails (GitHub security restriction)
console.log(`Could not post comment (expected for fork PRs): ${e.message}`);
return false;
}
}
if (isPrivileged) {
// Fork PRs from maintainers/admins still have to be triggered via /ok-to-test,
// because fork PRs don't have access to secrets on the pull_request event.
if (isFork) {
console.log(`Maintainer fork PR detected - auto-triggering /ok-to-test for ${prAuthor}`);
core.setOutput('should_run', 'false'); // Don't run on pull_request event
// Auto-post /ok-to-test to trigger issue_comment workflow
if (!botComment) {
const posted = await tryPostComment(`/ok-to-test`);
if (!posted) {
console.log('Note: Maintainer will need to manually comment /ok-to-test');
}
}
return;
}
// Non-fork PR from maintainer - run directly
core.setOutput('should_run', 'true');
return;
}
// External contributor - post instructions and skip
console.log('External contributor PR - posting instructions');
core.setOutput('should_run', 'false');
if (!botComment) {
const posted = await tryPostComment(`👋 Thanks for your contribution!\n\nThis PR is from a fork, so the e2e tests require approval to run (they use cluster resources).\n\n**For maintainers/admins:** Comment \`/ok-to-test\` to trigger the e2e tests after reviewing the code.\n\n**For contributors:** Please wait for a maintainer or admin to approve running the tests.`);
if (!posted) {
console.log('Note: Could not post instructions comment on fork PR');
}
}
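// Decision summary of the gate logic above:
//   workflow_dispatch                                     -> run
//   /ok-to-test or /retest from a user with write access  -> run
//   pull_request from a privileged author, same repo      -> run
//   pull_request from a privileged author, fork           -> skip here,
//       auto-post /ok-to-test so an issue_comment run can pick it up
//   any other pull_request                                -> skip, post
//       instructions asking a maintainer to comment /ok-to-test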
- name: Write workflow summary
if: always()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const shouldRun = '${{ steps.check.outputs.should_run }}';
if (shouldRun === 'true') {
core.summary.addRaw('**E2E tests will run** for this trigger.');
} else {
core.summary.addRaw('**E2E tests were skipped** (gate check did not pass for this trigger).');
}
await core.summary.write();
# Build the FMA images (controller, launcher-populator, requester, launcher)
# on a GitHub-hosted runner, using ko (the Go-native image builder) for the Go
# components and Docker for the rest, then push them to GHCR.
# Note: skipped for fork PRs on the pull_request event (no secrets access);
# for fork PRs, build-image instead runs via the issue_comment trigger (/ok-to-test).
build-image:
needs: gate
if: |
needs.gate.outputs.should_run == 'true' &&
(needs.gate.outputs.is_fork_pr != 'true' || github.event_name != 'pull_request')
runs-on: ubuntu-latest
outputs:
image_tag: ${{ steps.build.outputs.image_tag }}
controller_image: ${{ steps.build.outputs.controller_image }}
requester_image: ${{ steps.build.outputs.requester_image }}
launcher_image: ${{ steps.build.outputs.launcher_image }}
steps:
- name: Checkout source
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Set up Go
uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
with:
go-version: "1.25.7"
cache-dependency-path: ./go.sum
- name: Set up ko
uses: ko-build/setup-ko@d006021bd0c28d1ce33a07e7943d48b079944c8d # v0.9
- name: Log in to GHCR
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
with:
registry: ghcr.io
username: ${{ secrets.CR_USER }}
password: ${{ secrets.CR_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
- name: Build and push images
id: build
env:
GIT_REF: ${{ needs.gate.outputs.pr_head_sha }}
run: |
# Use first 8 chars of the git ref (POSIX-compliant)
IMAGE_TAG="ref-$(printf '%s' "$GIT_REF" | cut -c1-8)"
reg="${{ github.repository }}"
CONTAINER_IMG_REG="ghcr.io/${reg@L}"
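# ${reg@L} is the bash 5.1+ lowercase parameter transformation, e.g.
# 'My-Org/My-Repo' -> 'my-org/my-repo'; GHCR image references must be lowercase.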
echo "Building images with tag: $IMAGE_TAG"
echo "Registry: $CONTAINER_IMG_REG"
# Build controller (ko)
make build-controller \
CONTAINER_IMG_REG="$CONTAINER_IMG_REG" \
IMAGE_TAG="$IMAGE_TAG"
# Build launcher-populator (ko)
make build-populator \
CONTAINER_IMG_REG="$CONTAINER_IMG_REG" \
IMAGE_TAG="$IMAGE_TAG"
# Build requester (Docker, multi-platform)
make build-and-push-requester \
CONTAINER_IMG_REG="$CONTAINER_IMG_REG" \
REQUESTER_IMG_TAG="$IMAGE_TAG"
# Build launcher (Docker, GPU-capable)
make build-and-push-launcher \
CONTAINER_IMG_REG="$CONTAINER_IMG_REG" \
LAUNCHER_IMG_TAG="$IMAGE_TAG"
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "controller_image=${CONTAINER_IMG_REG}/dual-pods-controller:${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "requester_image=${CONTAINER_IMG_REG}/requester:${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "launcher_image=${CONTAINER_IMG_REG}/launcher:${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "All images built and pushed"
# Run e2e tests on OpenShift self-hosted runner
e2e-openshift:
runs-on: [self-hosted, openshift, vllm-d]
needs: [gate, build-image]
if: needs.gate.outputs.should_run == 'true'
env:
SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
# PR-specific namespace for isolation between concurrent PR tests
FMA_NAMESPACE: fma-e2e-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
# Unique release name per run to avoid conflicts
FMA_CHART_INSTANCE_NAME: fma-e2e-${{ github.run_id }}
# Image registry and tag from the build job
IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
# LAUNCHER_IMAGE and REQUESTER_IMAGE are needed by test object creation
# and cleanup step (rm-images-from-ocp-nodes.sh)
LAUNCHER_IMAGE: ${{ needs.build-image.outputs.launcher_image }}
REQUESTER_IMAGE: ${{ needs.build-image.outputs.requester_image }}
steps:
- name: Checkout source
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Install tools (kubectl, oc, helm)
run: |
# Install kubectl - pinned version for reproducible CI builds
KUBECTL_VERSION="v1.31.0"
echo "Installing kubectl version: $KUBECTL_VERSION"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl.sha256 "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256"
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
chmod +x kubectl
sudo mv kubectl /usr/local/bin/
rm -f kubectl.sha256
# Install oc (OpenShift CLI)
curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz"
tar -xzf openshift-client-linux.tar.gz
sudo mv oc /usr/local/bin/
rm -f openshift-client-linux.tar.gz kubectl README.md
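# The oc tarball also ships its own kubectl and a README; both are discarded
# so the pinned kubectl installed above stays the one in use.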
# Install helm
curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- name: Verify cluster access
run: |
echo "Verifying cluster access..."
kubectl cluster-info
kubectl get nodes
- name: Dump select info about each node with a GPU
run: |
for nodename in $(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[*].metadata.name}'); do
echo "For ${nodename}:"
echo "taints: $(kubectl get node $nodename -o jsonpath='{.spec.taints}')"
echo "conditions: $(kubectl get node $nodename -o jsonpath='{.status.conditions}' | jq .)"
echo "FMA images: $(kubectl get node $nodename -o jsonpath='{.status.images}' | jq '[ .[] | select(.names | any(contains("fast-model-actuation"))) | {"names":.names, "sizeMB":(.sizeBytes/1048576|floor) } ]')"
echo
done
continue-on-error: true
- name: Detect cluster type
run: |
CLUSTER_DOMAIN=$(oc get ingress.config cluster -o jsonpath='{.spec.domain}')
echo "Cluster domain: $CLUSTER_DOMAIN"
if echo "$CLUSTER_DOMAIN" | grep -q "pokprod"; then
echo "Detected pokprod cluster"
echo "CLUSTER_TYPE=pokprod" >> $GITHUB_ENV
fi
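# CLUSTER_TYPE is consumed later by the "Create test objects" step, which adds
# 'runtimeClassName: nvidia' to the pod templates on pokprod clusters.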
- name: Clean up resources for this PR
run: |
echo "Cleaning up FMA resources for this PR..."
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
if kubectl get namespace "$FMA_NAMESPACE" &>/dev/null; then
echo "=== Cleaning up namespace: $FMA_NAMESPACE ==="
# Uninstall all helm releases in the namespace
for release in $(helm list -n "$FMA_NAMESPACE" -q 2>/dev/null); do
echo " Uninstalling helm release: $release"
helm uninstall "$release" -n "$FMA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
done
# Remove dual-pods.llm-d.ai/* finalizers from all pods so namespace deletion is not blocked
echo " Removing dual-pods finalizers from pods in $FMA_NAMESPACE..."
for pod in $(kubectl get pods -n "$FMA_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
all_finalizers=$(kubectl get pod "$pod" -n "$FMA_NAMESPACE" \
-o jsonpath='{range .metadata.finalizers[*]}{@}{"\n"}{end}' 2>/dev/null || true)
if ! echo "$all_finalizers" | grep -q '^dual-pods\.llm-d\.ai/'; then
continue
fi
echo " Patching pod $pod to remove dual-pods finalizers"
keep_entries=$(echo "$all_finalizers" \
| grep -v '^dual-pods\.llm-d\.ai/' \
| awk 'NR>1{printf ","} {printf "\"%s\"", $0}')
kubectl patch pod "$pod" -n "$FMA_NAMESPACE" --type=merge \
-p="{\"metadata\":{\"finalizers\":[${keep_entries}]}}" 2>/dev/null || true
done
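# Illustrative example: a pod with finalizers
#   ["dual-pods.llm-d.ai/example", "example.com/keep"]
# would be patched to {"metadata":{"finalizers":["example.com/keep"]}}, so only
# the dual-pods finalizers are dropped before the namespace is deleted.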
echo " Deleting namespace: $FMA_NAMESPACE"
kubectl delete namespace "$FMA_NAMESPACE" --ignore-not-found --timeout=120s || true
else
echo "Namespace $FMA_NAMESPACE does not exist, skipping cleanup"
fi
# Clean up cluster-scoped resources from previous runs
echo "Cleaning up cluster-scoped resources..."
kubectl delete clusterrole "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
echo "Cleanup complete"
- name: Create namespace
run: |
# Wait for namespace to be fully deleted if still terminating
if kubectl get namespace "$FMA_NAMESPACE" &>/dev/null; then
echo "Waiting for namespace $FMA_NAMESPACE to be deleted..."
while kubectl get namespace "$FMA_NAMESPACE" &>/dev/null; do
echo "Namespace still terminating..."
sleep 2
done
fi
echo "Creating namespace $FMA_NAMESPACE..."
kubectl create namespace "$FMA_NAMESPACE"
- name: Create GHCR image pull secret
env:
CR_USER: ${{ secrets.CR_USER }}
CR_TOKEN: ${{ secrets.CR_TOKEN }}
run: |
echo "Creating GHCR image pull secret in $FMA_NAMESPACE..."
kubectl create secret docker-registry ghcr-pull-secret \
--docker-server=ghcr.io \
--docker-username="$CR_USER" \
--docker-password="$CR_TOKEN" \
-n "$FMA_NAMESPACE"
# Patch default SA so all pods in the namespace can pull from GHCR
kubectl patch serviceaccount default -n "$FMA_NAMESPACE" \
-p '{"imagePullSecrets": [{"name": "ghcr-pull-secret"}]}'
echo "GHCR pull secret created and attached to default SA"
- name: Deploy FMA (CRDs and controllers)
id: deploy-fma
env:
CONTAINER_IMG_REG: ghcr.io/${{ github.repository }}
IMAGE_TAG: ${{ env.IMAGE_TAG }}
NODE_VIEW_CLUSTER_ROLE: "create/please"
run: |
# Force container registry to lowercase, because this is how
# ghcr.io relates images to their source org/repo.
export CONTAINER_IMG_REG="${CONTAINER_IMG_REG,,}"
echo "Running deploy_fma.sh..."
./test/e2e/deploy_fma.sh
- name: Set up test service account
run: |
echo "Creating service account for test workloads..."
# The real requester does not interact with the Kubernetes API.
# The service account is only needed for imagePullSecrets.
# NOTE: If the launcher populator is later integrated into this e2e test
# and shares this SA, a Role with access to launcherconfigs and
# launcherpopulationpolicies will need to be added back.
kubectl create sa testreq -n "$FMA_NAMESPACE" || true
kubectl patch serviceaccount testreq -n "$FMA_NAMESPACE" \
-p '{"imagePullSecrets": [{"name": "ghcr-pull-secret"}]}'
echo "Service account created"
- name: Create test objects
id: test-objects
run: |
INST=$(date +%d-%H-%M-%S)
echo "Creating test objects with instance: $INST"
RUNTIME_CLASS=""
if [ "$CLUSTER_TYPE" = "pokprod" ]; then
RUNTIME_CLASS="runtimeClassName: nvidia"
fi
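# RUNTIME_CLASS is substituted verbatim into the pod specs below; on other
# clusters it expands to an empty line, which the applied YAML tolerates.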
kubectl apply -n "$FMA_NAMESPACE" -f - <<EOF
apiVersion: fma.llm-d.ai/v1alpha1
kind: InferenceServerConfig
metadata:
name: inference-server-config-${INST}
spec:
modelServerConfig:
port: 8005
options: "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --enable-sleep-mode"
env_vars:
VLLM_SERVER_DEV_MODE: "1"
VLLM_USE_V1: "1"
VLLM_LOGGING_LEVEL: "DEBUG"
labels:
component: inference
annotations:
description: "E2E test InferenceServerConfig"
launcherConfigName: launcher-config-${INST}
---
apiVersion: fma.llm-d.ai/v1alpha1
kind: LauncherConfig
metadata:
name: launcher-config-${INST}
spec:
maxSleepingInstances: 3
podTemplate:
spec:
${RUNTIME_CLASS}
imagePullSecrets:
- name: ghcr-pull-secret
containers:
- name: inference-server
image: ${LAUNCHER_IMAGE}
imagePullPolicy: Always
command:
- /app/launcher.py
- --host=0.0.0.0
- --log-level=info
- --port=8001
env:
- name: HF_HOME
value: "/tmp"
- name: VLLM_CACHE_ROOT
value: "/tmp"
- name: FLASHINFER_WORKSPACE_BASE
value: "/tmp"
- name: TRITON_CACHE_DIR
value: "/tmp"
- name: XDG_CACHE_HOME
value: "/tmp"
- name: XDG_CONFIG_HOME
value: "/tmp"
---
apiVersion: fma.llm-d.ai/v1alpha1
kind: LauncherPopulationPolicy
metadata:
name: lpp-${INST}
spec:
enhancedNodeSelector:
labelSelector:
matchLabels:
nvidia.com/gpu.present: "true"
countForLauncher:
- launcherConfigName: launcher-config-${INST}
launcherCount: 1
---
apiVersion: apps/v1
kind: ReplicaSet
metadata:
name: my-request-${INST}
labels:
app: dp-example
spec:
replicas: 1
selector:
matchLabels:
app: dp-example
template:
metadata:
labels:
app: dp-example
instance: "${INST}"
annotations:
dual-pods.llm-d.ai/admin-port: "8081"
dual-pods.llm-d.ai/inference-server-config: "inference-server-config-${INST}"
spec:
${RUNTIME_CLASS}
imagePullSecrets:
- name: ghcr-pull-secret
containers:
- name: inference-server
image: ${REQUESTER_IMAGE}
imagePullPolicy: Always
ports:
- name: probes
containerPort: 8080
- name: spi
containerPort: 8081
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 2
periodSeconds: 5
resources:
limits:
nvidia.com/gpu: "1"
cpu: "200m"
memory: 250Mi
serviceAccount: testreq
EOF
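# How the objects declared above relate: the ReplicaSet's requester pod names
# the InferenceServerConfig via the dual-pods.llm-d.ai/inference-server-config
# annotation, that config points at the LauncherConfig, and the
# LauncherPopulationPolicy pre-creates one launcher per matching GPU node for
# that LauncherConfig.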
echo "instance=${INST}" >> $GITHUB_OUTPUT
echo "isc=inference-server-config-${INST}" >> $GITHUB_OUTPUT
echo "lc=launcher-config-${INST}" >> $GITHUB_OUTPUT
echo "lpp=lpp-${INST}" >> $GITHUB_OUTPUT
echo "rs=my-request-${INST}" >> $GITHUB_OUTPUT
echo "Test objects created"
- name: Verify launcher pod creation and binding
env:
INST: ${{ steps.test-objects.outputs.instance }}
LC: ${{ steps.test-objects.outputs.lc }}
run: |
echo "Waiting for requester pod..."
ELAPSED=0
LIMIT=300
while true; do
COUNT=$(kubectl get pods -n "$FMA_NAMESPACE" -l "app=dp-example,instance=$INST" -o json 2>/dev/null | jq '.items | length')
if [ "$COUNT" -ge 1 ]; then
echo "Requester pod found"
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Requester pod did not appear within ${LIMIT}s"
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
REQUESTER=$(kubectl get pods -n "$FMA_NAMESPACE" -l "app=dp-example,instance=$INST" -o json | jq -r '.items[0].metadata.name')
echo "Requester pod: $REQUESTER"
# LauncherPopulationPolicy specifies launcherCount per node with nvidia.com/gpu.present=true
GPU_NODES=$(kubectl get nodes -l nvidia.com/gpu.present=true --field-selector spec.unschedulable!=true -o name | wc -l | tr -d ' ')
echo "Expecting launcher-populator to create $GPU_NODES launcher(s)"
echo "Waiting for launcher-populator to create launcher pods..."
ELAPSED=0
while true; do
COUNT=$(kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o json 2>/dev/null | jq '.items | length')
if [ "$COUNT" -ge "$GPU_NODES" ]; then
echo "Launcher-populator created $COUNT launcher(s) successfully"
kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o wide
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Launcher-populator did not create expected $GPU_NODES launcher(s) within ${LIMIT}s (found: $COUNT)"
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
# 1. Wait for at least two launcher pods to be Ready.
# The controller will not bind until a launcher is ready, so waiting here
# makes it easier to diagnose problems that prevent readiness.
# Launcher image is ~20GB, so allow extra time for uncached pulls.
# Some GPU nodes may be ineligible for scheduling on shared clusters, so
# require a smaller healthy subset instead of every created launcher pod.
echo "Waiting for at least two launcher pod(s) to be Ready..."
ELAPSED=0
READY_LAUNCHERS=0
# Temporary workaround: require only two ready launchers until the
# test accounts for tainted or otherwise ineligible GPU nodes.
READY_TARGET=2
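# The jq filter below counts pods whose status.conditions include an entry
# with type "Ready" and status "True".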
while true; do
READY_LAUNCHERS=$(kubectl get pods -n "$FMA_NAMESPACE" \
-l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o json \
| jq '[.items[] | select(.status.conditions[]? | select(.type == "Ready" and .status == "True"))] | length')
if [ "$READY_LAUNCHERS" -ge "$READY_TARGET" ]; then
echo "$READY_LAUNCHERS launcher pod(s) are Ready"
kubectl get pods -n "$FMA_NAMESPACE" \
-l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o wide
break
fi
if [ "$ELAPSED" -ge 600 ]; then
echo "::error::Fewer than ${READY_TARGET} launcher pod(s) became Ready within 600s (ready: $READY_LAUNCHERS)"
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
# 2. Verify launcher-to-requester binding.
# After launcher is ready, the controller binds by setting dual labels.
# Use the dual label to find the launcher bound to our requester.
echo "Verifying launcher-to-requester binding..."
ELAPSED=0
while true; do
LAUNCHER=$(kubectl get pods -n "$FMA_NAMESPACE" \
-l "dual-pods.llm-d.ai/launcher-config-name=$LC,dual-pods.llm-d.ai/dual=$REQUESTER" \
-o json | jq -r '.items[0].metadata.name // empty')
if [ -n "$LAUNCHER" ]; then
echo "Launcher bound to requester: $LAUNCHER -> $REQUESTER"
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Launcher-to-requester binding not established within ${LIMIT}s"
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
# 3. Verify requester-to-launcher binding (reverse direction).
echo "Verifying requester-to-launcher binding..."
ELAPSED=0
while true; do
REQUESTER_DUAL=$(kubectl get pod "$REQUESTER" -n "$FMA_NAMESPACE" -o json | jq -r '.metadata.labels["dual-pods.llm-d.ai/dual"] // empty')
if [ "$REQUESTER_DUAL" = "$LAUNCHER" ]; then
echo "Requester bound to launcher: $REQUESTER -> $LAUNCHER"
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Requester-to-launcher binding not established within ${LIMIT}s"
echo " Requester dual label: '$REQUESTER_DUAL' (expected: '$LAUNCHER')"
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
# 4. Wait for requester to be Ready.
# The controller ordering is: launcher ready -> binding -> instance created
# -> inference server ready -> requester ready.
echo "Waiting for requester pod to be Ready..."
kubectl wait --for=condition=Ready "pod/$REQUESTER" -n "$FMA_NAMESPACE" --timeout=120s
echo ""
echo "=== Launcher test passed: pods Ready and bound ==="
kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
- name: List objects of category all
if: always()
run: kubectl get all -n "$FMA_NAMESPACE"
- name: Dump all Pods
if: always()
run: kubectl get pods -n "$FMA_NAMESPACE" -o yaml
- name: List event objects
if: always()
run: kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp'
- name: Dump Pod logs
if: always()
run: |
for pod in $(kubectl get pods -n "$FMA_NAMESPACE" -o 'jsonpath={.items[*].metadata.name} ') ; do
echo ""
echo "=== Previous log of $pod ==="
kubectl logs -n "$FMA_NAMESPACE" $pod --previous || true
echo ""
echo "=== Log of $pod ==="
kubectl logs -n "$FMA_NAMESPACE" $pod || true
done
- name: Dump vLLM instance logs from launchers
if: always()
run: scripts/dump-launcher-vllm-logs.sh "$FMA_NAMESPACE"
- name: Clean up test objects
if: always()
env:
ISC: ${{ steps.test-objects.outputs.isc }}
LC: ${{ steps.test-objects.outputs.lc }}
LPP: ${{ steps.test-objects.outputs.lpp }}
RS: ${{ steps.test-objects.outputs.rs }}
run: |
echo "Cleaning up test objects..."
kubectl delete rs "$RS" -n "$FMA_NAMESPACE" --ignore-not-found || true
kubectl delete launcherpopulationpolicy "$LPP" -n "$FMA_NAMESPACE" --ignore-not-found || true
kubectl delete inferenceserverconfig "$ISC" -n "$FMA_NAMESPACE" --ignore-not-found || true
kubectl delete launcherconfig "$LC" -n "$FMA_NAMESPACE" --ignore-not-found || true
# Wait for test pods to terminate
sleep 10
echo "Test objects cleaned up"
- name: Cleanup infrastructure
# Run cleanup unless skip_cleanup was requested
if: always() && env.SKIP_CLEANUP != 'true'
run: |
echo "Cleaning up all FMA test infrastructure..."
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"
# Uninstall Helm releases
for release in $(helm list -n "$FMA_NAMESPACE" -q 2>/dev/null); do
echo " Uninstalling helm release: $release"
helm uninstall "$release" -n "$FMA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
done
# Remove dual-pods.llm-d.ai/* finalizers from all pods so namespace deletion is not blocked
echo " Removing dual-pods finalizers from pods in $FMA_NAMESPACE..."
for pod in $(kubectl get pods -n "$FMA_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
all_finalizers=$(kubectl get pod "$pod" -n "$FMA_NAMESPACE" \
-o jsonpath='{range .metadata.finalizers[*]}{@}{"\n"}{end}' 2>/dev/null || true)
if ! echo "$all_finalizers" | grep -q '^dual-pods\.llm-d\.ai/'; then
continue
fi
echo " Patching pod $pod to remove dual-pods finalizers"
keep_entries=$(echo "$all_finalizers" \
| grep -v '^dual-pods\.llm-d\.ai/' \
| awk 'NR>1{printf ","} {printf "\"%s\"", $0}')
kubectl patch pod "$pod" -n "$FMA_NAMESPACE" --type=merge \
-p="{\"metadata\":{\"finalizers\":[${keep_entries}]}}" 2>/dev/null || true
done
# Delete namespace
kubectl delete namespace "$FMA_NAMESPACE" \
--ignore-not-found --timeout=120s || true
# Delete the cluster-scoped RBAC objects used for reading Node objects
kubectl delete clusterrole "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
echo "Cleanup complete"
- name: Remove test images from cluster nodes
if: always() && env.SKIP_CLEANUP != 'true'
run: scripts/rm-images-from-ocp-nodes.sh "$LAUNCHER_IMAGE"
# Report status back to PR for issue_comment triggered runs
# This ensures fork PRs show the correct status after /ok-to-test runs complete
report-status:
runs-on: ubuntu-latest
needs: [gate, e2e-openshift]
# Run always (even on failure) but only for issue_comment events
if: always() && github.event_name == 'issue_comment' && needs.gate.outputs.should_run == 'true'
steps:
- name: Report status to PR
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
const e2eResult = '${{ needs.e2e-openshift.result }}';
// Map job result to commit status
let state, description;
if (e2eResult === 'success') {
state = 'success';
description = 'E2E tests passed';
} else if (e2eResult === 'skipped') {
state = 'pending';
description = 'E2E tests skipped';
} else if (e2eResult === 'cancelled') {
state = 'failure';
description = 'E2E tests cancelled';
} else {
state = 'failure';
description = 'E2E tests failed';
}
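// The commit status posted below appears in the PR checks UI under the given
// context, which is how fork PRs get a visible pass/fail result even though
// their pull_request-triggered run was skipped.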
console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: prHeadSha,
state: state,
target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
description: description,
context: '${{ github.workflow }} / e2e (comment trigger)'
});
console.log('Status reported successfully');