Skip to content
162 changes: 19 additions & 143 deletions .github/workflows/ci-e2e-openshift.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -337,11 +337,13 @@ jobs:
# PR-specific namespace for isolation between concurrent PR tests
FMA_NAMESPACE: fma-e2e-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
# Unique release name per run to avoid conflicts
FMA_RELEASE_NAME: fma-e2e-${{ github.run_id }}
# Use the images built in the previous job
CONTROLLER_IMAGE: ${{ needs.build-image.outputs.controller_image }}
REQUESTER_IMAGE: ${{ needs.build-image.outputs.requester_image }}
FMA_CHART_INSTANCE_NAME: fma-e2e-${{ github.run_id }}
# Image registry and tag from the build job
IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
# LAUNCHER_IMAGE and REQUESTER_IMAGE are needed by test object creation
# and cleanup step (rm-images-from-ocp-nodes.sh)
LAUNCHER_IMAGE: ${{ needs.build-image.outputs.launcher_image }}
REQUESTER_IMAGE: ${{ needs.build-image.outputs.requester_image }}
steps:
- name: Checkout source
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
Expand Down Expand Up @@ -428,8 +430,8 @@ jobs:

# Clean up cluster-scoped resources from previous runs
echo "Cleaning up cluster-scoped resources..."
kubectl delete clusterrole "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrole "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true

echo "Cleanup complete"

Expand Down Expand Up @@ -464,142 +466,16 @@ jobs:
-p '{"imagePullSecrets": [{"name": "ghcr-pull-secret"}]}'
echo "GHCR pull secret created and attached to default SA"

- name: Apply FMA CRDs
run: |
CRD_NAMES=""
for crd_file in config/crd/*.yaml; do
crd_name=$(kubectl apply --dry-run=client -f "$crd_file" -o jsonpath='{.metadata.name}')
CRD_NAMES="$CRD_NAMES $crd_name"
if kubectl get crd "$crd_name" &>/dev/null; then
echo " CRD $crd_name already exists, skipping"
else
echo " Applying $crd_file ($crd_name)"
kubectl apply --server-side -f "$crd_file"
fi
done

# Wait for CRDs to become Established (API servers have digested the definitions)
echo "Waiting for CRDs to become Established..."
CRD_TIMEOUT=120s
for crd_name in $CRD_NAMES; do
kubectl wait --for=condition=Established "crd/$crd_name" --timeout="$CRD_TIMEOUT"
done
echo "All CRDs established"

- name: Create node-viewer ClusterRole
run: |
echo "Creating ClusterRole ${FMA_RELEASE_NAME}-node-view..."
kubectl create clusterrole ${FMA_RELEASE_NAME}-node-view --verb=get,list,watch --resource=nodes
echo "ClusterRole created"

- name: Detect ValidatingAdmissionPolicy support
id: detect-vap
run: |
POLICIES_ENABLED=false
if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
| grep -q 'validatingadmissionpolicies'; then
POLICIES_ENABLED=true
fi
echo "ValidatingAdmissionPolicy support: $POLICIES_ENABLED"
echo "policies_enabled=$POLICIES_ENABLED" >> $GITHUB_OUTPUT

- name: Apply ValidatingAdmissionPolicy resources
- name: Deploy FMA (CRDs and controllers)
id: deploy-fma
env:
POLICIES_ENABLED: ${{ steps.detect-vap.outputs.policies_enabled }}
run: |
if [ "$POLICIES_ENABLED" = "true" ]; then
echo "Applying ValidatingAdmissionPolicy resources..."
kubectl apply -f config/validating-admission-policies/
else
echo "ValidatingAdmissionPolicy not supported, skipping."
fi

- name: Deploy FMA controller
run: |
echo "Deploying FMA controller..."
echo " Release: $FMA_RELEASE_NAME"
echo " Namespace: $FMA_NAMESPACE"
echo " Image: $CONTROLLER_IMAGE"

helm upgrade --install "$FMA_RELEASE_NAME" charts/fma-controllers \
-n "$FMA_NAMESPACE" \
--set global.imageRegistry="${CONTROLLER_IMAGE%/dual-pods-controller:*}" \
--set global.imageTag="${CONTROLLER_IMAGE##*:}" \
--set global.nodeViewClusterRole=${FMA_RELEASE_NAME}-node-view \
--set dualPodsController.sleeperLimit=2 \
--set global.local=false \
--set dualPodsController.debugAcceleratorMemory=false \
--set launcherPopulator.enabled=true

- name: Wait for FMA controllers to be ready
run: |
kubectl wait --for=condition=available --timeout=120s \
deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
echo ""
echo "=== Dual-Pod Controller Pod Status ==="
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=dual-pods-controller
echo ""
echo "=== Dual-Pod Controller Deployment ==="
kubectl get deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"

kubectl wait --for=condition=available --timeout=120s \
deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"
echo ""
echo "=== Launcher Populator Pod Status ==="
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=launcher-populator
echo ""
echo "=== Launcher Populator Deployment ==="
kubectl get deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"

- name: Verify controller health
CONTAINER_IMG_REG: ghcr.io/${{ github.repository }}
IMAGE_TAG: ${{ env.IMAGE_TAG }}
run: |
echo "Checking controller pod for issues..."

# Get the controller pod name
POD_NAME=$(kubectl get pods -n "$FMA_NAMESPACE" \
-l app.kubernetes.io/name=fma-controllers,app.kubernetes.io/component=dual-pods-controller \
-o jsonpath='{.items[0].metadata.name}')

if [ -z "$POD_NAME" ]; then
echo "::error::No controller pod found"
exit 1
fi

echo "Controller pod: $POD_NAME"

# Check pod is Running
PHASE=$(kubectl get pod "$POD_NAME" -n "$FMA_NAMESPACE" -o jsonpath='{.status.phase}')
if [ "$PHASE" != "Running" ]; then
echo "::error::Controller pod is in phase $PHASE, expected Running"
kubectl describe pod "$POD_NAME" -n "$FMA_NAMESPACE"
exit 1
fi

# Check for restarts
RESTARTS=$(kubectl get pod "$POD_NAME" -n "$FMA_NAMESPACE" \
-o jsonpath='{.status.containerStatuses[0].restartCount}')
if [ "$RESTARTS" -gt 0 ]; then
echo "::warning::Controller has restarted $RESTARTS time(s)"
fi

# Display recent logs
echo ""
echo "=== Controller Logs (last 50 lines) ==="
kubectl logs "$POD_NAME" -n "$FMA_NAMESPACE" --tail=50

# Check for fatal/panic in logs
# klog FATAL lines: F followed by 4 digits (MMDD), e.g. "F0210 19:21:..."
# Go panics: line starting with "panic:" (case sensitive)
FATAL_LINES=$(kubectl logs "$POD_NAME" -n "$FMA_NAMESPACE" 2>&1 \
| grep -E "^F[0-9]{4} |^panic:" | head -5) || true
if [ -n "$FATAL_LINES" ]; then
echo "::error::Controller logs contain FATAL or panic messages:"
echo "$FATAL_LINES"
exit 1
fi

echo ""
echo "Controller health check passed"
# Ensure registry is lowercase (GitHub requirement)
export CONTAINER_IMG_REG="${CONTAINER_IMG_REG,,}"
echo "Running deploy_fma.sh..."
./test/e2e/deploy_fma.sh

- name: Set up test service account
run: |
Expand Down Expand Up @@ -900,7 +776,7 @@ jobs:
run: |
echo "Cleaning up all FMA test infrastructure..."
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_RELEASE_NAME: $FMA_RELEASE_NAME"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"

# Uninstall Helm releases
for release in $(helm list -n "$FMA_NAMESPACE" -q 2>/dev/null); do
Expand Down Expand Up @@ -929,8 +805,8 @@ jobs:
--ignore-not-found --timeout=120s || true

# Delete cluster-scoped stuff for reading Node objects
kubectl delete clusterrole "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_RELEASE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrole "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true
kubectl delete clusterrolebinding "${FMA_CHART_INSTANCE_NAME}-node-view" --ignore-not-found || true

echo "Cleanup complete"

Expand Down
2 changes: 1 addition & 1 deletion charts/fma-controllers/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dualPodsController:
# Whether to debug the accelerator memory usage.
# This involves querying the requester;
# the test-requester does not support the query.
debugAcceleratorMemory: true
debugAcceleratorMemory: false

# Launcher populator controller configuration
launcherPopulator:
Expand Down
187 changes: 187 additions & 0 deletions test/e2e/deploy_fma.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env bash

# deploy_fma.sh
#
# Deploys the FMA controllers (dual-pods controller + launcher-populator)
# via Helm and waits for both Deployments to become available.
#
# Must be run from the root of the Git repository (relies on relative
# paths such as config/crd/ and charts/fma-controllers).
#
# Required environment variables:
#   FMA_NAMESPACE           - target Kubernetes namespace
#   FMA_CHART_INSTANCE_NAME - Helm chart instance name
#   CONTAINER_IMG_REG       - container image registry/namespace
#                             (e.g. ghcr.io/llm-d-incubation/llm-d-fast-model-actuation)
#   IMAGE_TAG               - image tag for all components
#                             (e.g. ref-abcd1234)
#
# Optional environment variables:
#   NODE_VIEW_CLUSTER_ROLE  - ClusterRole granting node read access.
#                             If unset, the script creates one named
#                             "${FMA_CHART_INSTANCE_NAME}-node-view".
#                             If set to an existing ClusterRole name, it is
#                             used as-is (no creation).
#                             If set to "none", no ClusterRole is configured.
#   RUNTIME_CLASS_NAME      - if set, adds runtimeClassName to GPU pod specs
#                             (e.g. "nvidia" when the GPU operator requires it)
#   POLICIES_ENABLED        - "true"/"false"; auto-detected if unset
#   FMA_DEBUG               - "true" to enable shell tracing (set -x)
#   HELM_EXTRA_ARGS         - additional Helm arguments appended to the
#                             `helm upgrade --install` invocation
#                             (e.g. "--set global.local=true --set dualPodsController.sleeperLimit=4")

# Fail fast: abort on errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail
if [[ "${FMA_DEBUG:-false}" == "true" ]]; then
  set -x
fi

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Progress counters consumed by step() below.
current_step=0
total_steps=6

# step MESSAGE...
# Prints a numbered banner announcing the next deployment phase.
step() {
  current_step=$((current_step + 1))
  printf '\n========================================\n'
  printf '[deploy_fma] Step %s/%s: %s\n' "$current_step" "$total_steps" "$*"
  printf '========================================\n\n'
}

# ---------------------------------------------------------------------------
# Step 1: Validate required environment variables
# ---------------------------------------------------------------------------

step "Validate required environment variables"

# Collect every unset/empty required variable so the user sees all of them
# at once instead of fixing one per run.
unset_vars=""
for required in FMA_NAMESPACE FMA_CHART_INSTANCE_NAME CONTAINER_IMG_REG IMAGE_TAG; do
  # ${!required} is bash indirect expansion: the value of the variable
  # whose name is stored in $required.
  [ -n "${!required:-}" ] || unset_vars="$unset_vars $required"
done

if [ -n "$unset_vars" ]; then
  echo "ERROR: Missing required environment variables:$unset_vars" >&2
  exit 1
fi

echo "Configuration:"
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"
echo " CONTAINER_IMG_REG: $CONTAINER_IMG_REG"
echo " IMAGE_TAG: $IMAGE_TAG"
echo " NODE_VIEW_CLUSTER_ROLE: ${NODE_VIEW_CLUSTER_ROLE:-<will create>}"
echo " RUNTIME_CLASS_NAME: ${RUNTIME_CLASS_NAME:-<unset>}"
echo " POLICIES_ENABLED: ${POLICIES_ENABLED:-<auto-detect>}"
echo " HELM_EXTRA_ARGS: ${HELM_EXTRA_ARGS:-<none>}"

# ---------------------------------------------------------------------------
# Step 2: Apply FMA CRDs
# ---------------------------------------------------------------------------

step "Apply FMA CRDs"

# Apply each CRD manifest at most once: a client-side dry-run extracts the
# CRD name, and CRDs already present on the cluster are left untouched.
# NOTE(review): the jsonpath extraction assumes one document per CRD file —
# confirm config/crd/ never holds multi-document manifests.
crd_list=()
for manifest in config/crd/*.yaml; do
  name=$(kubectl apply --dry-run=client -f "$manifest" -o jsonpath='{.metadata.name}')
  crd_list+=("$name")
  if kubectl get crd "$name" &>/dev/null; then
    echo " CRD $name already exists, skipping"
  else
    echo " Applying $manifest ($name)"
    kubectl apply --server-side -f "$manifest"
  fi
done

# Block until the API servers report every CRD as Established, so later
# steps can create custom resources immediately.
echo "Waiting for CRDs to become Established..."
for name in "${crd_list[@]}"; do
  kubectl wait --for=condition=Established "crd/$name" --timeout=120s
done
echo "All CRDs established"

# ---------------------------------------------------------------------------
# Step 3: Create node-viewer ClusterRole
# ---------------------------------------------------------------------------

step "Configure node-viewer ClusterRole"

# Resolve CLUSTER_ROLE_NAME from NODE_VIEW_CLUSTER_ROLE:
#   "none"          -> empty string (no ClusterRole is wired into the chart)
#   any other value -> use the caller-provided ClusterRole as-is
#   unset/empty     -> create "<instance>-node-view" unless it already exists
case "${NODE_VIEW_CLUSTER_ROLE:-}" in
  none)
    CLUSTER_ROLE_NAME=""
    echo "Skipped (NODE_VIEW_CLUSTER_ROLE=none)"
    ;;
  ?*)
    CLUSTER_ROLE_NAME="${NODE_VIEW_CLUSTER_ROLE}"
    echo "Using existing ClusterRole: $CLUSTER_ROLE_NAME"
    ;;
  *)
    CLUSTER_ROLE_NAME="${FMA_CHART_INSTANCE_NAME}-node-view"
    if kubectl get clusterrole "$CLUSTER_ROLE_NAME" &>/dev/null; then
      echo "ClusterRole $CLUSTER_ROLE_NAME already exists, skipping"
    else
      kubectl create clusterrole "$CLUSTER_ROLE_NAME" --verb=get,list,watch --resource=nodes
      echo "ClusterRole $CLUSTER_ROLE_NAME created"
    fi
    ;;
esac

# ---------------------------------------------------------------------------
# Step 4: Detect and apply ValidatingAdmissionPolicies
# ---------------------------------------------------------------------------

step "ValidatingAdmissionPolicies"

# Auto-detect support when the caller did not set POLICIES_ENABLED by
# probing the admissionregistration.k8s.io API group for the resource.
if [ -z "${POLICIES_ENABLED:-}" ]; then
  if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
      | grep -q 'validatingadmissionpolicies'; then
    POLICIES_ENABLED=true
  else
    POLICIES_ENABLED=false
  fi
  echo "Auto-detected POLICIES_ENABLED=$POLICIES_ENABLED"
fi

if [ "$POLICIES_ENABLED" != "true" ]; then
  echo "ValidatingAdmissionPolicy not supported or disabled, skipping"
else
  echo "Applying ValidatingAdmissionPolicy resources..."
  kubectl apply -f config/validating-admission-policies/
fi

# ---------------------------------------------------------------------------
# Step 5: Deploy FMA controllers via Helm
# ---------------------------------------------------------------------------

step "Deploy FMA controllers via Helm"

# Base chart values: which registry to pull images from and which tag to use.
HELM_ARGS=(
  --set global.imageRegistry="${CONTAINER_IMG_REG}"
  --set global.imageTag="${IMAGE_TAG}"
)

# Wire in the node-view ClusterRole unless step 3 resolved it to "" ("none").
if [ -n "$CLUSTER_ROLE_NAME" ]; then
  HELM_ARGS+=(--set global.nodeViewClusterRole="${CLUSTER_ROLE_NAME}")
fi

# Caller-supplied arguments go LAST so they can override ANY value the
# script sets: with repeated --set flags, Helm lets the right-most one win.
# (Previously these were appended before the nodeViewClusterRole flag, so
# callers could override the registry/tag but not the ClusterRole.)
# NOTE: the string is split on whitespace, so individual extra arguments
# must not themselves contain spaces.
if [ -n "${HELM_EXTRA_ARGS:-}" ]; then
  read -ra _extra <<< "$HELM_EXTRA_ARGS"
  HELM_ARGS+=("${_extra[@]}")
fi

helm upgrade --install "$FMA_CHART_INSTANCE_NAME" charts/fma-controllers \
  -n "$FMA_NAMESPACE" \
  "${HELM_ARGS[@]}"

# ---------------------------------------------------------------------------
# Step 6: Wait for controllers to be ready
# ---------------------------------------------------------------------------

step "Wait for controllers to be ready"

# Both Deployments follow the "<instance>-<component>" naming scheme;
# wait for each in turn (order matches the original: dual-pods first).
for component in dual-pods-controller launcher-populator; do
  kubectl wait --for=condition=available --timeout=120s \
    deployment "${FMA_CHART_INSTANCE_NAME}-${component}" -n "$FMA_NAMESPACE"
done
echo "Both controllers are available"

echo ""
echo "[deploy_fma] All steps completed successfully"
Loading