# Test cases for multiple instances sharing one launcher pod (#284)
name: CI - OpenShift E2E Tests

# Permissions needed for various jobs
permissions:
  contents: read
  pull-requests: write  # For posting comments on PRs
  statuses: write  # For reporting status on fork PR commits

# Cancel previous runs on the same PR to avoid resource conflicts
# Only group by PR number for legitimate triggers (pull_request, workflow_dispatch, /ok-to-test, or /retest comments)
# Regular comments get a unique group (run_id) so they don't cancel in-progress test runs
#
# Logic:
# - Regular comments (not /ok-to-test or /retest): unique group prevents cancellation of real tests
# - Valid triggers: group 'fma-e2e-openshift-{pr_number}' (can cancel previous runs for same PR)
# - Fallback chain for ID: pull_request.number -> issue.number -> run_id
#
# NOTE: Valid command list (/ok-to-test, /retest) must stay in sync with gate job validation
concurrency:
  group: >-
    ${{
      github.event_name == 'issue_comment' &&
      !contains(github.event.comment.body, '/ok-to-test') &&
      !contains(github.event.comment.body, '/retest')
      && format('comment-isolated-{0}', github.run_id)
      || format('fma-e2e-openshift-{0}',
        github.event.pull_request.number
        || github.event.issue.number
        || github.run_id)
    }}
  cancel-in-progress: true
on:
  pull_request:
    branches:
      - main
  # Allow maintainers to trigger tests on fork PRs via /ok-to-test comment
  issue_comment:
    types: [created]
  workflow_dispatch:
    inputs:
      skip_cleanup:
        description: 'Skip cleanup after tests'
        required: false
        default: 'false'
jobs:
  # Gate: Check permissions and handle /ok-to-test for fork PRs
  # - Maintainers (write access): Tests run automatically
  # - External contributors: Must wait for maintainer to comment /ok-to-test
  gate:
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.check.outputs.should_run }}
      pr_number: ${{ steps.check.outputs.pr_number }}
      pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
      is_fork_pr: ${{ steps.check.outputs.is_fork_pr }}
    steps:
      - name: Check permissions and /ok-to-test
        id: check
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
            // Helper to check if user has write access
            async function hasWriteAccess(username) {
              try {
                const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  username: username
                });
                const privilegedRoles = ['admin', 'maintain', 'write'];
                return privilegedRoles.includes(permission.permission);
              } catch (e) {
                console.log(`Could not get permissions for ${username}: ${e.message}`);
                return false;
              }
            }

            // Always run for workflow_dispatch
            if (context.eventName === 'workflow_dispatch') {
              core.setOutput('should_run', 'true');
              core.setOutput('pr_number', '');
              core.setOutput('pr_head_sha', context.sha);
              core.setOutput('is_fork_pr', 'false');
              return;
            }

            // Handle issue_comment event (/ok-to-test or /retest)
            if (context.eventName === 'issue_comment') {
              const comment = context.payload.comment.body.trim();
              const issue = context.payload.issue;

              // Only process /ok-to-test or /retest comments on PRs
              if (!issue.pull_request) {
                console.log('Comment is not on a PR, skipping');
                core.setOutput('should_run', 'false');
                return;
              }

              // NOTE: This list must stay in sync with concurrency group logic
              const validCommands = ['/ok-to-test', '/retest'];
              if (!validCommands.includes(comment)) {
                console.log(`Comment "${comment}" is not a valid trigger command, skipping`);
                core.setOutput('should_run', 'false');
                return;
              }

              // Check if commenter has write access
              const commenter = context.payload.comment.user.login;
              const hasAccess = await hasWriteAccess(commenter);
              if (!hasAccess) {
                console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
                core.setOutput('should_run', 'false');
                return;
              }

              // Get PR details to get head SHA
              const { data: pr } = await github.rest.pulls.get({
                owner: context.repo.owner,
                repo: context.repo.repo,
                pull_number: issue.number
              });

              // Check if PR is from a fork
              const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
              const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
              const isFork = headRepo !== baseRepo;

              console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
              console.log(`PR head SHA: ${pr.head.sha}`);
              console.log(`Is fork PR: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
              core.setOutput('should_run', 'true');
              core.setOutput('pr_number', issue.number.toString());
              core.setOutput('pr_head_sha', pr.head.sha);
              core.setOutput('is_fork_pr', isFork ? 'true' : 'false');

              // Add reaction to acknowledge
              await github.rest.reactions.createForIssueComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: context.payload.comment.id,
                content: 'rocket'
              });

              // Post comment with link to the e2e workflow run
              const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issue.number,
                body: `🚀 **E2E tests triggered by ${comment}**\n\n[View the OpenShift E2E workflow run](${runUrl})`
              });
              return;
            }

            // Handle pull_request event
            const pr = context.payload.pull_request;
            const prAuthor = pr.user.login;
            const prNumber = pr.number;
            const prHeadSha = pr.head.sha;

            // Check if PR is from a fork
            const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
            const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
            const isFork = headRepo !== baseRepo;
            console.log(`PR #${prNumber} is from fork: ${isFork} (head: ${headRepo}, base: ${baseRepo})`);
            core.setOutput('pr_number', prNumber.toString());
            core.setOutput('pr_head_sha', prHeadSha);
            core.setOutput('is_fork_pr', isFork ? 'true' : 'false');

            // Check if PR author has write access
            const isPrivileged = await hasWriteAccess(prAuthor);
            console.log(`PR #${prNumber} author ${prAuthor}: privileged=${isPrivileged}`);

            // Check if we already posted a bot comment
            const comments = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber
            });
            const botComment = comments.data.find(c =>
              c.user.type === 'Bot' &&
              c.body.includes('ok-to-test')
            );

            // Helper to safely post a comment (may fail on fork PRs due to permissions)
            async function tryPostComment(body) {
              try {
                await github.rest.issues.createComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  issue_number: prNumber,
                  body: body
                });
                return true;
              } catch (e) {
                // Fork PRs can't post comments on pull_request event (GitHub security restriction)
                console.log(`Could not post comment (expected for fork PRs): ${e.message}`);
                return false;
              }
            }

            if (isPrivileged) {
              // For maintainer/admin fork PRs, we need to trigger via /ok-to-test
              // because fork PRs don't have access to secrets on pull_request event
              if (isFork) {
                console.log(`Maintainer fork PR detected - auto-triggering /ok-to-test for ${prAuthor}`);
                core.setOutput('should_run', 'false'); // Don't run on pull_request event
                // Auto-post /ok-to-test to trigger issue_comment workflow
                if (!botComment) {
                  const posted = await tryPostComment(`/ok-to-test`);
                  if (!posted) {
                    console.log('Note: Maintainer will need to manually comment /ok-to-test');
                  }
                }
                return;
              }
              // Non-fork PR from maintainer - run directly
              core.setOutput('should_run', 'true');
              return;
            }

            // External contributor - post instructions and skip
            console.log('External contributor PR - posting instructions');
            core.setOutput('should_run', 'false');
            if (!botComment) {
              const posted = await tryPostComment(`👋 Thanks for your contribution!\n\nThis PR is from a fork, so the e2e tests require approval to run (they use cluster resources).\n\n**For maintainers/admins:** Comment \`/ok-to-test\` to trigger the e2e tests after reviewing the code.\n\n**For contributors:** Please wait for a maintainer or admin to approve running the tests.`);
              if (!posted) {
                console.log('Note: Could not post instructions comment on fork PR');
              }
            }
# Build the FMA controller image on GitHub-hosted runner
# Uses ko (Go-native image builder) and pushes to GHCR
# Note: Skip for fork PRs on pull_request event (no secrets access).
# For fork PRs, build-image runs via issue_comment trigger (/ok-to-test).
build-image:
needs: gate
if: |
needs.gate.outputs.should_run == 'true' &&
(needs.gate.outputs.is_fork_pr != 'true' || github.event_name != 'pull_request')
runs-on: ubuntu-latest
outputs:
image_tag: ${{ steps.build.outputs.image_tag }}
controller_image: ${{ steps.build.outputs.controller_image }}
test_requester_image: ${{ steps.build.outputs.test_requester_image }}
test_launcher_image: ${{ steps.build.outputs.test_launcher_image }}
steps:
- name: Checkout source
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Set up Go
uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0
with:
go-version: "1.25.7"
cache-dependency-path: ./go.sum
- name: Set up ko
uses: ko-build/setup-ko@d006021bd0c28d1ce33a07e7943d48b079944c8d # v0.9
- name: Log in to GHCR
uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5
with:
registry: ghcr.io
username: ${{ secrets.CR_USER }}
password: ${{ secrets.CR_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0
- name: Build and push images
id: build
env:
GIT_REF: ${{ needs.gate.outputs.pr_head_sha }}
run: |
# Use first 8 chars of the git ref (POSIX-compliant)
IMAGE_TAG="ref-$(printf '%s' "$GIT_REF" | cut -c1-8)"
reg="${{ github.repository }}"
CONTAINER_IMG_REG="ghcr.io/${reg@L}"
echo "Building images with tag: $IMAGE_TAG"
echo "Registry: $CONTAINER_IMG_REG"
# Build controller (ko)
make build-controller \
CONTAINER_IMG_REG="$CONTAINER_IMG_REG" \
IMAGE_TAG="$IMAGE_TAG"
# Build launcher-populator (ko)
make build-populator \
CONTAINER_IMG_REG="$CONTAINER_IMG_REG" \
IMAGE_TAG="$IMAGE_TAG"
# Build test-requester (ko)
KO_DOCKER_REPO="$CONTAINER_IMG_REG" \
ko build -B ./cmd/test-requester -t "$IMAGE_TAG" --platform linux/amd64
# Build test-launcher (Docker)
docker buildx build --push \
-t "${CONTAINER_IMG_REG}/test-launcher:${IMAGE_TAG}" \
-f dockerfiles/Dockerfile.launcher.cpu . \
--platform linux/amd64
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "controller_image=${CONTAINER_IMG_REG}/dual-pods-controller:${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "test_requester_image=${CONTAINER_IMG_REG}/test-requester:${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "test_launcher_image=${CONTAINER_IMG_REG}/test-launcher:${IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "All images built and pushed"
# Run e2e tests on OpenShift self-hosted runner
e2e-openshift:
runs-on: [self-hosted, openshift]
needs: [gate, build-image]
if: needs.gate.outputs.should_run == 'true'
env:
SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
# PR-specific namespace for isolation between concurrent PR tests
FMA_NAMESPACE: fma-e2e-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
# Unique release name per run to avoid conflicts
FMA_RELEASE_NAME: fma-e2e-${{ github.run_id }}
# Use the images built in the previous job
CONTROLLER_IMAGE: ${{ needs.build-image.outputs.controller_image }}
TEST_REQUESTER_IMAGE: ${{ needs.build-image.outputs.test_requester_image }}
TEST_LAUNCHER_IMAGE: ${{ needs.build-image.outputs.test_launcher_image }}
steps:
- name: Checkout source
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Install tools (kubectl, oc, helm)
run: |
# Install kubectl - pinned version for reproducible CI builds
KUBECTL_VERSION="v1.31.0"
echo "Installing kubectl version: $KUBECTL_VERSION"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl.sha256 "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl.sha256"
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
chmod +x kubectl
sudo mv kubectl /usr/local/bin/
rm -f kubectl.sha256
# Install oc (OpenShift CLI)
curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz"
tar -xzf openshift-client-linux.tar.gz
sudo mv oc /usr/local/bin/
rm -f openshift-client-linux.tar.gz kubectl README.md
# Install helm
curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- name: Verify cluster access
run: |
echo "Verifying cluster access..."
kubectl cluster-info
kubectl get nodes
- name: Clean up resources for this PR
run: |
echo "Cleaning up FMA resources for this PR..."
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
if kubectl get namespace "$FMA_NAMESPACE" &>/dev/null; then
echo "=== Cleaning up namespace: $FMA_NAMESPACE ==="
# Uninstall all helm releases in the namespace
for release in $(helm list -n "$FMA_NAMESPACE" -q 2>/dev/null); do
echo " Uninstalling helm release: $release"
helm uninstall "$release" -n "$FMA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
done
# Remove dual-pods.llm-d.ai/* finalizers from all pods so namespace deletion is not blocked
echo " Removing dual-pods finalizers from pods in $FMA_NAMESPACE..."
for pod in $(kubectl get pods -n "$FMA_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
all_finalizers=$(kubectl get pod "$pod" -n "$FMA_NAMESPACE" \
-o jsonpath='{range .metadata.finalizers[*]}{@}{"\n"}{end}' 2>/dev/null || true)
if ! echo "$all_finalizers" | grep -q '^dual-pods\.llm-d\.ai/'; then
continue
fi
echo " Patching pod $pod to remove dual-pods finalizers"
keep_entries=$(echo "$all_finalizers" \
| grep -v '^dual-pods\.llm-d\.ai/' \
| awk 'NR>1{printf ","} {printf "\"%s\"", $0}')
kubectl patch pod "$pod" -n "$FMA_NAMESPACE" --type=merge \
-p="{\"metadata\":{\"finalizers\":[${keep_entries}]}}" 2>/dev/null || true
done
echo " Deleting namespace: $FMA_NAMESPACE"
kubectl delete namespace "$FMA_NAMESPACE" --ignore-not-found --timeout=120s || true
else
echo "Namespace $FMA_NAMESPACE does not exist, skipping cleanup"
fi
# Clean up cluster-scoped resources from previous runs
echo "Cleaning up cluster-scoped resources..."
kubectl delete clusterrolebinding -l app.kubernetes.io/name=fma-controllers --ignore-not-found 2>/dev/null || true
kubectl delete clusterrole fma-node-viewer --ignore-not-found || true
# Clean up ValidatingAdmissionPolicy resources (cluster-scoped, created by dual-pods-controller chart)
kubectl delete validatingadmissionpolicy fma-bound-serverreqpod fma-immutable-fields --ignore-not-found 2>/dev/null || true
kubectl delete validatingadmissionpolicybinding bind-fma-bound-serverreqpod bind-fma-immutable-fields --ignore-not-found 2>/dev/null || true
echo "Cleanup complete"
- name: Create namespace
run: |
# Wait for namespace to be fully deleted if still terminating
if kubectl get namespace "$FMA_NAMESPACE" &>/dev/null; then
echo "Waiting for namespace $FMA_NAMESPACE to be deleted..."
while kubectl get namespace "$FMA_NAMESPACE" &>/dev/null; do
echo "Namespace still terminating..."
sleep 2
done
fi
echo "Creating namespace $FMA_NAMESPACE..."
kubectl create namespace "$FMA_NAMESPACE"
- name: Create GHCR image pull secret
env:
CR_USER: ${{ secrets.CR_USER }}
CR_TOKEN: ${{ secrets.CR_TOKEN }}
run: |
echo "Creating GHCR image pull secret in $FMA_NAMESPACE..."
kubectl create secret docker-registry ghcr-pull-secret \
--docker-server=ghcr.io \
--docker-username="$CR_USER" \
--docker-password="$CR_TOKEN" \
-n "$FMA_NAMESPACE"
# Patch default SA so all pods in the namespace can pull from GHCR
kubectl patch serviceaccount default -n "$FMA_NAMESPACE" \
-p '{"imagePullSecrets": [{"name": "ghcr-pull-secret"}]}'
echo "GHCR pull secret created and attached to default SA"
- name: Apply FMA CRDs
run: |
echo "Applying FMA CRDs..."
kubectl apply --server-side -f config/crd/
# Verify CRDs are registered
echo "Verifying CRDs..."
kubectl get crd inferenceserverconfigs.fma.llm-d.ai
kubectl get crd launcherconfigs.fma.llm-d.ai
kubectl get crd launcherpopulationpolicies.fma.llm-d.ai
echo "All CRDs registered successfully"
- name: Create ConfigMaps
run: |
echo "Creating gpu-map and gpu-allocs ConfigMaps in $FMA_NAMESPACE..."
# The controller requires these ConfigMaps to exist.
# Empty maps are sufficient for install verification; the controller starts
# but cannot schedule dual pods without real GPU data.
kubectl create configmap gpu-map -n "$FMA_NAMESPACE"
kubectl create configmap gpu-allocs -n "$FMA_NAMESPACE"
echo "ConfigMaps created"
- name: Create node-viewer ClusterRole
run: |
echo "Creating node-viewer ClusterRole..."
kubectl create clusterrole fma-node-viewer --verb=get,list,watch --resource=nodes
echo "ClusterRole created"
- name: Detect ValidatingAdmissionPolicy support
id: detect-vap
run: |
POLICIES_ENABLED=false
if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
| grep -q 'validatingadmissionpolicies'; then
POLICIES_ENABLED=true
fi
echo "ValidatingAdmissionPolicy support: $POLICIES_ENABLED"
echo "policies_enabled=$POLICIES_ENABLED" >> $GITHUB_OUTPUT
- name: Deploy FMA controller
env:
POLICIES_ENABLED: ${{ steps.detect-vap.outputs.policies_enabled }}
run: |
echo "Deploying FMA controller..."
echo " Release: $FMA_RELEASE_NAME"
echo " Namespace: $FMA_NAMESPACE"
echo " Image: $CONTROLLER_IMAGE"
echo " EnableValidationPolicy: $POLICIES_ENABLED"
helm upgrade --install "$FMA_RELEASE_NAME" charts/fma-controllers \
-n "$FMA_NAMESPACE" \
--set global.imageRegistry="${CONTROLLER_IMAGE%/dual-pods-controller:*}" \
--set global.imageTag="${CONTROLLER_IMAGE##*:}" \
--set global.nodeViewClusterRole=fma-node-viewer \
--set dualPodsController.sleeperLimit=2 \
--set global.local=false \
--set dualPodsController.debugAcceleratorMemory=false \
--set global.enableValidationPolicy="$POLICIES_ENABLED" \
--set launcherPopulator.enabled=false
- name: Wait for controller to be ready
run: |
echo "Waiting for FMA controller deployment to be ready..."
kubectl wait --for=condition=available --timeout=120s \
deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
echo ""
echo "=== Controller Pod Status ==="
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/name=fma-controllers
echo ""
echo "=== Controller Deployment ==="
kubectl get deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
- name: Verify controller health
run: |
echo "Checking controller pod for issues..."
# Get the controller pod name
POD_NAME=$(kubectl get pods -n "$FMA_NAMESPACE" \
-l app.kubernetes.io/name=fma-controllers,app.kubernetes.io/component=dual-pods-controller \
-o jsonpath='{.items[0].metadata.name}')
if [ -z "$POD_NAME" ]; then
echo "::error::No controller pod found"
exit 1
fi
echo "Controller pod: $POD_NAME"
# Check pod is Running
PHASE=$(kubectl get pod "$POD_NAME" -n "$FMA_NAMESPACE" -o jsonpath='{.status.phase}')
if [ "$PHASE" != "Running" ]; then
echo "::error::Controller pod is in phase $PHASE, expected Running"
kubectl describe pod "$POD_NAME" -n "$FMA_NAMESPACE"
exit 1
fi
# Check for restarts
RESTARTS=$(kubectl get pod "$POD_NAME" -n "$FMA_NAMESPACE" \
-o jsonpath='{.status.containerStatuses[0].restartCount}')
if [ "$RESTARTS" -gt 0 ]; then
echo "::warning::Controller has restarted $RESTARTS time(s)"
fi
# Display recent logs
echo ""
echo "=== Controller Logs (last 50 lines) ==="
kubectl logs "$POD_NAME" -n "$FMA_NAMESPACE" --tail=50
# Check for fatal/panic in logs
# klog FATAL lines: F followed by 4 digits (MMDD), e.g. "F0210 19:21:..."
# Go panics: line starting with "panic:" (case sensitive)
FATAL_LINES=$(kubectl logs "$POD_NAME" -n "$FMA_NAMESPACE" 2>&1 \
| grep -E "^F[0-9]{4} |^panic:" | head -5) || true
if [ -n "$FATAL_LINES" ]; then
echo "::error::Controller logs contain FATAL or panic messages:"
echo "$FATAL_LINES"
exit 1
fi
echo ""
echo "Controller health check passed"
- name: Set up test RBAC
run: |
echo "Creating RBAC for test workloads..."
kubectl apply -n "$FMA_NAMESPACE" -f - <<'EOF'
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: testreq
rules:
- apiGroups: ["fma.llm-d.ai"]
resources: ["inferenceserverconfigs", "launcherconfigs"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["configmaps"]
resourceNames: ["gpu-map", "gpu-allocs"]
verbs: ["update", "patch", "get", "list", "watch"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["create"]
EOF
kubectl create sa testreq -n "$FMA_NAMESPACE" || true
kubectl patch serviceaccount testreq -n "$FMA_NAMESPACE" \
-p '{"imagePullSecrets": [{"name": "ghcr-pull-secret"}]}'
kubectl create rolebinding testreq \
--role=testreq --serviceaccount="${FMA_NAMESPACE}:testreq" \
-n "$FMA_NAMESPACE" || true
kubectl create clusterrolebinding "testreq-view-${FMA_NAMESPACE}" \
--clusterrole=view --serviceaccount="${FMA_NAMESPACE}:testreq" || true
echo "Test RBAC created"
- name: Populate GPU map
run: |
echo "Populating gpu-map from cluster nodes..."
# Patch gpu-map one node at a time (matching the KinD test approach).
# Each entry maps GPU UUID to device index, e.g. {"GPU-0": 0, "GPU-1": 1}.
gpu_idx=0
found_gpus=false
for node in $(kubectl get nodes -l nvidia.com/gpu.present=true -o name 2>/dev/null | sed 's%^node/%%'); do
gpu_count=$(kubectl get node "$node" -o jsonpath='{.status.capacity.nvidia\.com/gpu}' 2>/dev/null || echo "0")
if [ "$gpu_count" -gt 0 ]; then
found_gpus=true
# Build JSON map: {"GPU-0": 0, "GPU-1": 1, ...}
gpu_json="{"
for i in $(seq 0 $((gpu_count - 1))); do
[ "$i" -gt 0 ] && gpu_json="${gpu_json}, "
gpu_json="${gpu_json}\"GPU-${gpu_idx}\": ${i}"
gpu_idx=$((gpu_idx + 1))
done
gpu_json="${gpu_json}}"
echo " Node $node: $gpu_count GPUs -> $gpu_json"
# Patch one node at a time using YAML format (like the KinD test)
kubectl patch configmap gpu-map -n "$FMA_NAMESPACE" \
-p "$(printf 'data:\n %s: '"'"'%s'"'"'' "$node" "$gpu_json")"
fi
done
if [ "$found_gpus" = false ]; then
echo "::warning::No GPU nodes found. Test objects requesting GPUs will remain Pending."
fi
# Verify gpu-map contents
echo "gpu-map contents:"
kubectl get configmap gpu-map -n "$FMA_NAMESPACE" -o yaml
# Create gpu-allocs if not already present
kubectl get configmap gpu-allocs -n "$FMA_NAMESPACE" &>/dev/null || \
kubectl create configmap gpu-allocs -n "$FMA_NAMESPACE"
- name: Create test objects
id: test-objects
run: |
INST=$(date +%d-%H-%M-%S)
echo "Creating test objects with instance: $INST"
kubectl apply -n "$FMA_NAMESPACE" -f - <<EOF
apiVersion: fma.llm-d.ai/v1alpha1
kind: InferenceServerConfig
metadata:
name: inference-server-config-${INST}
spec:
modelServerConfig:
port: 8005
options: "--model TinyLlama/TinyLlama-1.1B-Chat-v1.0"
env_vars:
VLLM_SERVER_DEV_MODE: "1"
VLLM_USE_V1: "1"
VLLM_LOGGING_LEVEL: "DEBUG"
labels:
component: inference
annotations:
description: "E2E test InferenceServerConfig"
launcherConfigName: launcher-config-${INST}
---
apiVersion: fma.llm-d.ai/v1alpha1
kind: LauncherConfig
metadata:
name: launcher-config-${INST}
spec:
maxSleepingInstances: 3
podTemplate:
spec:
imagePullSecrets:
- name: ghcr-pull-secret
containers:
- name: inference-server
image: ${TEST_LAUNCHER_IMAGE}
imagePullPolicy: Always
command:
- /bin/bash
- "-c"
args:
- |
uvicorn launcher:app \
--host 0.0.0.0 \
--log-level info \
--port 8001
---
apiVersion: apps/v1
kind: ReplicaSet
metadata:
name: my-request-${INST}
labels:
app: dp-example
spec:
replicas: 1
selector:
matchLabels:
app: dp-example
template:
metadata:
labels:
app: dp-example
instance: "${INST}"
annotations:
dual-pods.llm-d.ai/admin-port: "8081"
dual-pods.llm-d.ai/inference-server-config: "inference-server-config-${INST}"
spec:
imagePullSecrets:
- name: ghcr-pull-secret
containers:
- name: inference-server
image: ${TEST_REQUESTER_IMAGE}
imagePullPolicy: Always
command:
- /ko-app/test-requester
- --node=\$(NODE_NAME)
- --pod-uid=\$(POD_UID)
- --namespace=\$(NAMESPACE)
env:
- name: NODE_NAME
valueFrom:
fieldRef: { fieldPath: spec.nodeName }
- name: POD_UID
valueFrom:
fieldRef: { fieldPath: metadata.uid }
- name: NAMESPACE
valueFrom:
fieldRef: { fieldPath: metadata.namespace }
ports:
- name: probes
containerPort: 8080
- name: spi
containerPort: 8081
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 2
periodSeconds: 5
resources:
limits:
nvidia.com/gpu: "1"
cpu: "200m"
memory: 250Mi
serviceAccount: testreq
EOF
echo "instance=${INST}" >> $GITHUB_OUTPUT
echo "isc=inference-server-config-${INST}" >> $GITHUB_OUTPUT
echo "lc=launcher-config-${INST}" >> $GITHUB_OUTPUT
echo "rs=my-request-${INST}" >> $GITHUB_OUTPUT
echo "Test objects created"
- name: Verify launcher pod creation and binding
env:
INST: ${{ steps.test-objects.outputs.instance }}
LC: ${{ steps.test-objects.outputs.lc }}
run: |
echo "Waiting for requester pod..."
ELAPSED=0
LIMIT=300
while true; do
COUNT=$(kubectl get pods -n "$FMA_NAMESPACE" -l "app=dp-example,instance=$INST" -o name 2>/dev/null | wc -l | tr -d ' ')
if [ "$COUNT" -ge 1 ]; then
echo "Requester pod found"
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Requester pod did not appear within ${LIMIT}s"
kubectl get pods -n "$FMA_NAMESPACE" -o wide
kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
REQUESTER=$(kubectl get pods -n "$FMA_NAMESPACE" -l "app=dp-example,instance=$INST" -o name | head -1 | sed 's%pod/%%')
echo "Requester pod: $REQUESTER"
echo "Waiting for launcher pod..."
ELAPSED=0
while true; do
COUNT=$(kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o name 2>/dev/null | wc -l | tr -d ' ')
if [ "$COUNT" -ge 1 ]; then
echo "Launcher pod found"
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Launcher pod did not appear within ${LIMIT}s"
kubectl get pods -n "$FMA_NAMESPACE" -o wide
kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
echo "=== Controller logs ==="
kubectl logs deployment/"$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE" --tail=50 || true
echo "=== Test-requester logs ==="
kubectl logs "$REQUESTER" -n "$FMA_NAMESPACE" --tail=50 || true
echo "=== gpu-map contents ==="
kubectl get configmap gpu-map -n "$FMA_NAMESPACE" -o yaml || true
echo "=== gpu-allocs contents ==="
kubectl get configmap gpu-allocs -n "$FMA_NAMESPACE" -o yaml || true
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
LAUNCHER=$(kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o name | head -1 | sed 's%pod/%%')
echo "Launcher pod: $LAUNCHER"
# Wait for launcher pod to be Ready (proves launcher API is serving)
echo "Waiting for launcher pod to be Ready..."
kubectl wait --for=condition=Ready "pod/$LAUNCHER" -n "$FMA_NAMESPACE" --timeout=120s
# Verify the controller bound the launcher to the requester.
# The controller sets the dual label on the launcher pod during bind().
# Note: The requester's dual label requires a successful wake_up call to the
# model server (port 8005), which needs a running vLLM instance. The test-launcher
# CPU image may not start a model server on GPU nodes, so we only verify the
# launcher->requester direction of the binding.
echo "Verifying launcher-to-requester binding..."
ELAPSED=0
while true; do
LAUNCHER_DUAL=$(kubectl get pod "$LAUNCHER" -n "$FMA_NAMESPACE" -o jsonpath='{.metadata.labels.dual-pods\.llm-d\.ai/dual}' 2>/dev/null || true)
if [ "$LAUNCHER_DUAL" = "$REQUESTER" ]; then
echo "Launcher bound to requester: $LAUNCHER -> $REQUESTER"
break
fi
if [ "$ELAPSED" -ge "$LIMIT" ]; then
echo "::error::Launcher-to-requester binding not established within ${LIMIT}s"
echo " Launcher dual label: '$LAUNCHER_DUAL' (expected: '$REQUESTER')"
kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
echo "=== Controller logs ==="
kubectl logs deployment/"$FMA_RELEASE_NAME" -n "$FMA_NAMESPACE" --tail=100 || true
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
echo ""
echo "=== Launcher test passed: pods created and launcher bound to requester ==="
kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
- name: List objects of category all
if: always()
run: kubectl get all -n "$FMA_NAMESPACE"
- name: List event objects
if: always()
run: kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp'
- name: Dump Pod logs
if: always()
run: |
for pod in $(kubectl get pods -n "$FMA_NAMESPACE" -o 'jsonpath={.items[*].metadata.name} ') ; do
echo ""
echo "=== Previous log of $pod ==="
kubectl logs -n "$FMA_NAMESPACE" $pod --previous || true
echo ""
echo "=== Log of $pod ==="
kubectl logs -n "$FMA_NAMESPACE" $pod || true
done
- name: Clean up test objects
if: always()
env:
ISC: ${{ steps.test-objects.outputs.isc }}
LC: ${{ steps.test-objects.outputs.lc }}
RS: ${{ steps.test-objects.outputs.rs }}
run: |
echo "Cleaning up test objects..."
kubectl delete rs "$RS" -n "$FMA_NAMESPACE" --ignore-not-found || true
kubectl delete inferenceserverconfig "$ISC" -n "$FMA_NAMESPACE" --ignore-not-found || true
kubectl delete launcherconfig "$LC" -n "$FMA_NAMESPACE" --ignore-not-found || true
# Wait for test pods to terminate
sleep 10
echo "Test objects cleaned up"
- name: Cleanup infrastructure
# Cleanup on success or cancellation, but NOT on failure (preserve for debugging)
if: (success() || cancelled()) && env.SKIP_CLEANUP != 'true'
run: |
echo "Cleaning up all FMA test infrastructure..."
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_RELEASE_NAME: $FMA_RELEASE_NAME"
# Uninstall Helm releases
for release in $(helm list -n "$FMA_NAMESPACE" -q 2>/dev/null); do
echo " Uninstalling helm release: $release"
helm uninstall "$release" -n "$FMA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
done
# Remove dual-pods.llm-d.ai/* finalizers from all pods so namespace deletion is not blocked
echo " Removing dual-pods finalizers from pods in $FMA_NAMESPACE..."
for pod in $(kubectl get pods -n "$FMA_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
all_finalizers=$(kubectl get pod "$pod" -n "$FMA_NAMESPACE" \
-o jsonpath='{range .metadata.finalizers[*]}{@}{"\n"}{end}' 2>/dev/null || true)
if ! echo "$all_finalizers" | grep -q '^dual-pods\.llm-d\.ai/'; then
continue
fi
echo " Patching pod $pod to remove dual-pods finalizers"
keep_entries=$(echo "$all_finalizers" \
| grep -v '^dual-pods\.llm-d\.ai/' \
| awk 'NR>1{printf ","} {printf "\"%s\"", $0}')
kubectl patch pod "$pod" -n "$FMA_NAMESPACE" --type=merge \
-p="{\"metadata\":{\"finalizers\":[${keep_entries}]}}" 2>/dev/null || true
done
# Delete namespace
kubectl delete namespace "$FMA_NAMESPACE" \
--ignore-not-found --timeout=120s || true
# Delete CRDs
# TODO: Implement safe CRD lifecycle management for tests (e.g., handle shared clusters,
# concurrent test runs, and version upgrades/downgrades) before enabling CRD deletion.
# kubectl delete -f config/crd/ --ignore-not-found || true
# Delete cluster-scoped resources
kubectl delete clusterrole fma-node-viewer --ignore-not-found || true
kubectl delete clusterrolebinding "$FMA_RELEASE_NAME-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "testreq-view-${FMA_NAMESPACE}" --ignore-not-found || true
# Delete ValidatingAdmissionPolicy resources (cluster-scoped)
kubectl delete validatingadmissionpolicy fma-bound-serverreqpod fma-immutable-fields --ignore-not-found 2>/dev/null || true
kubectl delete validatingadmissionpolicybinding bind-fma-bound-serverreqpod bind-fma-immutable-fields --ignore-not-found 2>/dev/null || true
echo "Cleanup complete"
- name: Scale down controller on failure
if: failure()
run: |
echo "Test failed - scaling down controller to free resources while preserving for debugging..."
kubectl scale deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE" --replicas=0 || true
# Report status back to PR for issue_comment triggered runs
# This ensures fork PRs show the correct status after /ok-to-test runs complete
report-status:
runs-on: ubuntu-latest
needs: [gate, e2e-openshift]
# Run always (even on failure) but only for issue_comment events
if: always() && github.event_name == 'issue_comment' && needs.gate.outputs.should_run == 'true'
steps:
- name: Report status to PR
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
const e2eResult = '${{ needs.e2e-openshift.result }}';
// Map job result to commit status
let state, description;
if (e2eResult === 'success') {
state = 'success';
description = 'E2E tests passed';
} else if (e2eResult === 'skipped') {
state = 'pending';
description = 'E2E tests skipped';
} else if (e2eResult === 'cancelled') {
state = 'failure';
description = 'E2E tests cancelled';
} else {
state = 'failure';
description = 'E2E tests failed';
}
console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: prHeadSha,
state: state,
target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
description: description,
context: '${{ github.workflow }} / e2e (comment trigger)'
});
console.log('Status reported successfully');