Skip to content
162 changes: 19 additions & 143 deletions .github/workflows/ci-e2e-openshift.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -337,11 +337,13 @@ jobs:
# PR-specific namespace for isolation between concurrent PR tests
FMA_NAMESPACE: fma-e2e-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
# Unique release name per run to avoid conflicts
FMA_RELEASE_NAME: fma-e2e-${{ github.run_id }}
# Use the images built in the previous job
CONTROLLER_IMAGE: ${{ needs.build-image.outputs.controller_image }}
REQUESTER_IMAGE: ${{ needs.build-image.outputs.requester_image }}
FMA_CHART_INSTANCE_NAME: fma-e2e-${{ github.run_id }}
# Image registry and tag from the build job
IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
# LAUNCHER_IMAGE and REQUESTER_IMAGE are needed by test object creation
# and cleanup step (rm-images-from-ocp-nodes.sh)
LAUNCHER_IMAGE: ${{ needs.build-image.outputs.launcher_image }}
REQUESTER_IMAGE: ${{ needs.build-image.outputs.requester_image }}
steps:
- name: Checkout source
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
Expand Down Expand Up @@ -428,8 +430,8 @@ jobs:

# Clean up cluster-scoped resources from previous runs
echo "Cleaning up cluster-scoped resources..."
kubectl delete clusterrole "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrole "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true

echo "Cleanup complete"

Expand Down Expand Up @@ -464,142 +466,16 @@ jobs:
-p '{"imagePullSecrets": [{"name": "ghcr-pull-secret"}]}'
echo "GHCR pull secret created and attached to default SA"

- name: Apply FMA CRDs
run: |
CRD_NAMES=""
for crd_file in config/crd/*.yaml; do
crd_name=$(kubectl apply --dry-run=client -f "$crd_file" -o jsonpath='{.metadata.name}')
CRD_NAMES="$CRD_NAMES $crd_name"
if kubectl get crd "$crd_name" &>/dev/null; then
echo " CRD $crd_name already exists, skipping"
else
echo " Applying $crd_file ($crd_name)"
kubectl apply --server-side -f "$crd_file"
fi
done

# Wait for CRDs to become Established (API servers have digested the definitions)
echo "Waiting for CRDs to become Established..."
CRD_TIMEOUT=120s
for crd_name in $CRD_NAMES; do
kubectl wait --for=condition=Established "crd/$crd_name" --timeout="$CRD_TIMEOUT"
done
echo "All CRDs established"

- name: Create node-viewer ClusterRole
run: |
echo "Creating ClusterRole ${FMA_RELEASE_NAME}-node-view..."
kubectl create clusterrole ${FMA_RELEASE_NAME}-node-view --verb=get,list,watch --resource=nodes
echo "ClusterRole created"

- name: Detect ValidatingAdmissionPolicy support
id: detect-vap
run: |
POLICIES_ENABLED=false
if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
| grep -q 'validatingadmissionpolicies'; then
POLICIES_ENABLED=true
fi
echo "ValidatingAdmissionPolicy support: $POLICIES_ENABLED"
echo "policies_enabled=$POLICIES_ENABLED" >> $GITHUB_OUTPUT

- name: Apply ValidatingAdmissionPolicy resources
- name: Deploy FMA (CRDs and controllers)
id: deploy-fma
env:
POLICIES_ENABLED: ${{ steps.detect-vap.outputs.policies_enabled }}
run: |
if [ "$POLICIES_ENABLED" = "true" ]; then
echo "Applying ValidatingAdmissionPolicy resources..."
kubectl apply -f config/validating-admission-policies/
else
echo "ValidatingAdmissionPolicy not supported, skipping."
fi

- name: Deploy FMA controller
run: |
echo "Deploying FMA controller..."
echo " Release: $FMA_RELEASE_NAME"
echo " Namespace: $FMA_NAMESPACE"
echo " Image: $CONTROLLER_IMAGE"

helm upgrade --install "$FMA_RELEASE_NAME" charts/fma-controllers \
-n "$FMA_NAMESPACE" \
--set global.imageRegistry="${CONTROLLER_IMAGE%/dual-pods-controller:*}" \
--set global.imageTag="${CONTROLLER_IMAGE##*:}" \
--set global.nodeViewClusterRole=${FMA_RELEASE_NAME}-node-view \
--set dualPodsController.sleeperLimit=2 \
--set global.local=false \
--set dualPodsController.debugAcceleratorMemory=false \
--set launcherPopulator.enabled=true

- name: Wait for FMA controllers to be ready
run: |
kubectl wait --for=condition=available --timeout=120s \
deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
echo ""
echo "=== Dual-Pod Controller Pod Status ==="
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=dual-pods-controller
echo ""
echo "=== Dual-Pod Controller Deployment ==="
kubectl get deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"

kubectl wait --for=condition=available --timeout=120s \
deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"
echo ""
echo "=== Launcher Populator Pod Status ==="
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=launcher-populator
echo ""
echo "=== Launcher Populator Deployment ==="
kubectl get deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"

- name: Verify controller health
CONTAINER_IMG_REG: ghcr.io/${{ github.repository }}
IMAGE_TAG: ${{ env.IMAGE_TAG }}
run: |
echo "Checking controller pod for issues..."

# Get the controller pod name
POD_NAME=$(kubectl get pods -n "$FMA_NAMESPACE" \
-l app.kubernetes.io/name=fma-controllers,app.kubernetes.io/component=dual-pods-controller \
-o jsonpath='{.items[0].metadata.name}')

if [ -z "$POD_NAME" ]; then
echo "::error::No controller pod found"
exit 1
fi

echo "Controller pod: $POD_NAME"

# Check pod is Running
PHASE=$(kubectl get pod "$POD_NAME" -n "$FMA_NAMESPACE" -o jsonpath='{.status.phase}')
if [ "$PHASE" != "Running" ]; then
echo "::error::Controller pod is in phase $PHASE, expected Running"
kubectl describe pod "$POD_NAME" -n "$FMA_NAMESPACE"
exit 1
fi

# Check for restarts
RESTARTS=$(kubectl get pod "$POD_NAME" -n "$FMA_NAMESPACE" \
-o jsonpath='{.status.containerStatuses[0].restartCount}')
if [ "$RESTARTS" -gt 0 ]; then
echo "::warning::Controller has restarted $RESTARTS time(s)"
fi

# Display recent logs
echo ""
echo "=== Controller Logs (last 50 lines) ==="
kubectl logs "$POD_NAME" -n "$FMA_NAMESPACE" --tail=50

# Check for fatal/panic in logs
# klog FATAL lines: F followed by 4 digits (MMDD), e.g. "F0210 19:21:..."
# Go panics: line starting with "panic:" (case sensitive)
FATAL_LINES=$(kubectl logs "$POD_NAME" -n "$FMA_NAMESPACE" 2>&1 \
| grep -E "^F[0-9]{4} |^panic:" | head -5) || true
if [ -n "$FATAL_LINES" ]; then
echo "::error::Controller logs contain FATAL or panic messages:"
echo "$FATAL_LINES"
exit 1
fi

echo ""
echo "Controller health check passed"
# Ensure registry is lowercase (GitHub requirement)
export CONTAINER_IMG_REG="${CONTAINER_IMG_REG,,}"
echo "Running deploy_fma.sh..."
./test/e2e/deploy_fma.sh

- name: Set up test service account
run: |
Expand Down Expand Up @@ -900,7 +776,7 @@ jobs:
run: |
echo "Cleaning up all FMA test infrastructure..."
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_RELEASE_NAME: $FMA_RELEASE_NAME"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"

# Uninstall Helm releases
for release in $(helm list -n "$FMA_NAMESPACE" -q 2>/dev/null); do
Expand Down Expand Up @@ -929,8 +805,8 @@ jobs:
--ignore-not-found --timeout=120s || true

# Delete cluster-scoped stuff for reading Node objects
kubectl delete clusterrole "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrole "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true

echo "Cleanup complete"

Expand Down
2 changes: 1 addition & 1 deletion charts/fma-controllers/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dualPodsController:
# Whether to debug the accelerator memory usage.
# This involves querying the requester;
# the test-requester does not support the query.
debugAcceleratorMemory: true
debugAcceleratorMemory: false

# Launcher populator controller configuration
launcherPopulator:
Expand Down
187 changes: 187 additions & 0 deletions test/e2e/deploy_fma.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env bash

# deploy_fma.sh
#
# Deploys the FMA controllers (dual-pods controller + launcher-populator)
# via Helm and waits for both Deployments to become available.
#
# Must be run from the root of the Git repository (relies on relative
# paths such as config/crd/ and charts/fma-controllers).
#
# Required environment variables:
#   FMA_NAMESPACE           - target Kubernetes namespace
#   FMA_CHART_INSTANCE_NAME - Helm chart instance name
#   CONTAINER_IMG_REG       - container image registry/namespace
#                             (e.g. ghcr.io/llm-d-incubation/llm-d-fast-model-actuation)
#   IMAGE_TAG               - image tag for all components
#                             (e.g. ref-abcd1234)
#
# Optional environment variables:
#   NODE_VIEW_CLUSTER_ROLE  - ClusterRole granting node read access.
#                             If unset, the script creates one named
#                             "${FMA_CHART_INSTANCE_NAME}-node-view".
#                             If set to an existing ClusterRole name, it is
#                             used as-is (no creation).
#                             If set to "none", no ClusterRole is configured.
#   RUNTIME_CLASS_NAME      - if set, adds runtimeClassName to GPU pod specs
#                             (e.g. "nvidia" when the GPU operator requires it)
#   POLICIES_ENABLED        - "true"/"false"; auto-detected if unset
#   FMA_DEBUG               - "true" to enable shell tracing (set -x)
#   HELM_EXTRA_ARGS         - additional Helm arguments appended to the
#                             `helm upgrade --install` invocation
#                             (e.g. "--set global.local=true --set dualPodsController.sleeperLimit=4")

# Fail fast: abort on errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail
if [[ "${FMA_DEBUG:-false}" == "true" ]]; then
  set -x
fi

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Progress counters consumed by step() below.
current_step=0
total_steps=6

# step MESSAGE...
# Prints a numbered banner announcing the next deployment phase.
step() {
  current_step=$((current_step + 1))
  printf '\n========================================\n'
  printf '[deploy_fma] Step %s/%s: %s\n' "$current_step" "$total_steps" "$*"
  printf '========================================\n\n'
}

# ---------------------------------------------------------------------------
# Step 1: Validate required environment variables
# ---------------------------------------------------------------------------

step "Validate required environment variables"

# Collect every unset/empty required variable so the user sees all of them
# at once instead of fixing one per run.
unset_vars=""
for required in FMA_NAMESPACE FMA_CHART_INSTANCE_NAME CONTAINER_IMG_REG IMAGE_TAG; do
  # ${!required} is bash indirect expansion: the value of the variable
  # whose name is stored in $required.
  [ -n "${!required:-}" ] || unset_vars="$unset_vars $required"
done

if [ -n "$unset_vars" ]; then
  echo "ERROR: Missing required environment variables:$unset_vars" >&2
  exit 1
fi

echo "Configuration:"
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"
echo " CONTAINER_IMG_REG: $CONTAINER_IMG_REG"
echo " IMAGE_TAG: $IMAGE_TAG"
echo " NODE_VIEW_CLUSTER_ROLE: ${NODE_VIEW_CLUSTER_ROLE:-<will create>}"
echo " RUNTIME_CLASS_NAME: ${RUNTIME_CLASS_NAME:-<unset>}"
echo " POLICIES_ENABLED: ${POLICIES_ENABLED:-<auto-detect>}"
echo " HELM_EXTRA_ARGS: ${HELM_EXTRA_ARGS:-<none>}"

# ---------------------------------------------------------------------------
# Step 2: Apply FMA CRDs
# ---------------------------------------------------------------------------

step "Apply FMA CRDs"

# Apply each CRD manifest at most once: a client-side dry-run extracts the
# CRD name, and CRDs already present on the cluster are left untouched.
# NOTE(review): the jsonpath extraction assumes one document per CRD file —
# confirm config/crd/ never holds multi-document manifests.
crd_list=()
for manifest in config/crd/*.yaml; do
  name=$(kubectl apply --dry-run=client -f "$manifest" -o jsonpath='{.metadata.name}')
  crd_list+=("$name")
  if kubectl get crd "$name" &>/dev/null; then
    echo " CRD $name already exists, skipping"
  else
    echo " Applying $manifest ($name)"
    kubectl apply --server-side -f "$manifest"
  fi
done

# Block until the API servers report every CRD as Established, so later
# steps can create custom resources immediately.
echo "Waiting for CRDs to become Established..."
for name in "${crd_list[@]}"; do
  kubectl wait --for=condition=Established "crd/$name" --timeout=120s
done
echo "All CRDs established"

# ---------------------------------------------------------------------------
# Step 3: Create node-viewer ClusterRole
# ---------------------------------------------------------------------------

step "Configure node-viewer ClusterRole"

# Resolve CLUSTER_ROLE_NAME from NODE_VIEW_CLUSTER_ROLE:
#   "none"          -> empty string (no ClusterRole is wired into the chart)
#   any other value -> use the caller-provided ClusterRole as-is
#   unset/empty     -> create "<instance>-node-view" unless it already exists
case "${NODE_VIEW_CLUSTER_ROLE:-}" in
  none)
    CLUSTER_ROLE_NAME=""
    echo "Skipped (NODE_VIEW_CLUSTER_ROLE=none)"
    ;;
  ?*)
    CLUSTER_ROLE_NAME="${NODE_VIEW_CLUSTER_ROLE}"
    echo "Using existing ClusterRole: $CLUSTER_ROLE_NAME"
    ;;
  *)
    CLUSTER_ROLE_NAME="${FMA_CHART_INSTANCE_NAME}-node-view"
    if kubectl get clusterrole "$CLUSTER_ROLE_NAME" &>/dev/null; then
      echo "ClusterRole $CLUSTER_ROLE_NAME already exists, skipping"
    else
      kubectl create clusterrole "$CLUSTER_ROLE_NAME" --verb=get,list,watch --resource=nodes
      echo "ClusterRole $CLUSTER_ROLE_NAME created"
    fi
    ;;
esac

# ---------------------------------------------------------------------------
# Step 4: Detect and apply ValidatingAdmissionPolicies
# ---------------------------------------------------------------------------

step "ValidatingAdmissionPolicies"

# Auto-detect support when the caller did not set POLICIES_ENABLED by
# probing the admissionregistration.k8s.io API group for the resource.
if [ -z "${POLICIES_ENABLED:-}" ]; then
  if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
      | grep -q 'validatingadmissionpolicies'; then
    POLICIES_ENABLED=true
  else
    POLICIES_ENABLED=false
  fi
  echo "Auto-detected POLICIES_ENABLED=$POLICIES_ENABLED"
fi

if [ "$POLICIES_ENABLED" != "true" ]; then
  echo "ValidatingAdmissionPolicy not supported or disabled, skipping"
else
  echo "Applying ValidatingAdmissionPolicy resources..."
  kubectl apply -f config/validating-admission-policies/
fi

# ---------------------------------------------------------------------------
# Step 5: Deploy FMA controllers via Helm
# ---------------------------------------------------------------------------

step "Deploy FMA controllers via Helm"

# Base chart values: which registry to pull images from and which tag to use.
HELM_ARGS=(
  --set global.imageRegistry="${CONTAINER_IMG_REG}"
  --set global.imageTag="${IMAGE_TAG}"
)

# Wire in the node-view ClusterRole unless step 3 resolved it to "" ("none").
if [ -n "$CLUSTER_ROLE_NAME" ]; then
  HELM_ARGS+=(--set global.nodeViewClusterRole="${CLUSTER_ROLE_NAME}")
fi

# Caller-supplied arguments go LAST so they can override ANY value the
# script sets: with repeated --set flags, Helm lets the right-most one win.
# (Previously these were appended before the nodeViewClusterRole flag, so
# callers could override the registry/tag but not the ClusterRole.)
# NOTE: the string is split on whitespace, so individual extra arguments
# must not themselves contain spaces.
if [ -n "${HELM_EXTRA_ARGS:-}" ]; then
  read -ra _extra <<< "$HELM_EXTRA_ARGS"
  HELM_ARGS+=("${_extra[@]}")
fi

helm upgrade --install "$FMA_CHART_INSTANCE_NAME" charts/fma-controllers \
  -n "$FMA_NAMESPACE" \
  "${HELM_ARGS[@]}"

# ---------------------------------------------------------------------------
# Step 6: Wait for controllers to be ready
# ---------------------------------------------------------------------------

step "Wait for controllers to be ready"

# Both Deployments follow the "<instance>-<component>" naming scheme;
# wait for each in turn (order matches the original: dual-pods first).
for component in dual-pods-controller launcher-populator; do
  kubectl wait --for=condition=available --timeout=120s \
    deployment "${FMA_CHART_INSTANCE_NAME}-${component}" -n "$FMA_NAMESPACE"
done
echo "Both controllers are available"

echo ""
echo "[deploy_fma] All steps completed successfully"
Loading