Skip to content

Commit 0ec0d9e

Browse files
MikeSpreitzer and claude
committed
Probe for a 2-GPU node before running E2E test cases
Before creating any test objects, launch a throwaway Pod that requests 2 GPUs. The scheduler places it on a node that actually has 2 GPUs free right now. Record that node, delete the probe, and pin the requester ReplicaSet to it via a new --node flag accepted by both mkobjs scripts. This prevents spurious failures on shared clusters where GPU availability is dynamic (Issue #422). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 2993df4 commit 0ec0d9e

3 files changed

Lines changed: 88 additions & 7 deletions

File tree

test/e2e/mkobjs-openshift.sh

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# Creates test Kubernetes objects for the OpenShift / real-cluster E2E path.
44
#
5-
# Usage: mkobjs-openshift.sh [-n <namespace>]
5+
# Usage: mkobjs-openshift.sh [-n <namespace>] [--node <node-name>]
66
#
77
# Required environment variables:
88
# LAUNCHER_IMAGE - container image for the launcher pod
@@ -17,8 +17,9 @@
1717

1818
set -euo pipefail
1919

20-
# Parse optional -n / --namespace flag
20+
# Parse optional flags
2121
ns_flag=()
22+
node_name=""
2223
while [ $# -gt 0 ]; do
2324
case "$1" in
2425
-n|--namespace)
@@ -30,6 +31,15 @@ while [ $# -gt 0 ]; do
3031
exit 1
3132
fi
3233
;;
34+
--node)
35+
if [ $# -gt 1 ] ; then
36+
node_name="$2"
37+
shift 2
38+
else
39+
echo "Missing --node argument" >&2
40+
exit 1
41+
fi
42+
;;
3343
*)
3444
echo "Unknown argument: $1" >&2
3545
exit 1
@@ -48,6 +58,14 @@ if [ -n "${RUNTIME_CLASS_NAME:-}" ]; then
4858
runtime_class="runtimeClassName: ${RUNTIME_CLASS_NAME}"
4959
fi
5060

61+
# When a node is specified, pin the ReplicaSet's pods to it.
62+
if [ -n "$node_name" ]; then
63+
node_selector="nodeSelector:
64+
kubernetes.io/hostname: \"$node_name\""
65+
else
66+
node_selector=""
67+
fi
68+
5169
if out=$(kubectl apply "${ns_flag[@]}" -f - 2>&1 <<EOF
5270
apiVersion: v1
5371
kind: ServiceAccount
@@ -218,6 +236,7 @@ spec:
218236
dual-pods.llm-d.ai/inference-server-config: "inference-server-config-smol-$inst"
219237
spec:
220238
${runtime_class}
239+
${node_selector}
221240
containers:
222241
- name: inference-server
223242
image: ${REQUESTER_IMAGE}

test/e2e/mkobjs.sh

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/usr/bin/env bash
22

3-
# Parse optional -n / --namespace flag
3+
# Parse optional flags
44
ns_flag=()
5+
node_name=""
56
while [ $# -gt 0 ]; do
67
case "$1" in
78
-n|--namespace)
@@ -13,13 +14,30 @@ while [ $# -gt 0 ]; do
1314
exit 1
1415
fi
1516
;;
17+
--node)
18+
if [ $# -gt 1 ] ; then
19+
node_name="$2"
20+
shift 2
21+
else
22+
echo "Missing --node argument" >&2
23+
exit 1
24+
fi
25+
;;
1626
*)
1727
echo "Unknown argument: $1" >&2
1828
exit 1
1929
;;
2030
esac
2131
done
2232

33+
# When a node is specified, pin the ReplicaSet's pods to it.
34+
if [ -n "$node_name" ]; then
35+
node_selector="nodeSelector:
36+
kubernetes.io/hostname: \"$node_name\""
37+
else
38+
node_selector=""
39+
fi
40+
2341
inst=$(date +%d-%H-%M-%S)
2442
requester_img=$(make echo-var VAR=TEST_REQUESTER_IMG)
2543
launcher_img=$(make echo-var VAR=TEST_LAUNCHER_IMG)
@@ -194,8 +212,8 @@ spec:
194212
nvidia.com/gpu: "1"
195213
cpu: "200m"
196214
memory: 250Mi
215+
${node_selector}
197216
serviceAccount: testreq
198-
# nodeName: fmatest-worker # try fixed node for the consistency of value of dual-pods.llm-d.ai/launcher-config-hash annotation
199217
EOF
200218
)
201219
then

test/e2e/test-cases.sh

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,58 @@ check_gpu_pin() {
102102
echo "GPU UUID(s) verified on pod $pod: $actual_uuids"
103103
}
104104

105+
# ---------------------------------------------------------------------------
106+
# Probe for a node with 2 free GPUs
107+
# ---------------------------------------------------------------------------
108+
# Create a throwaway Pod that requests 2 GPUs. The scheduler will place it
109+
# on a node that actually has 2 GPUs available right now. Once it is running
110+
# we record the node, delete the probe Pod, and pin every subsequent test
111+
# workload to that node. This avoids spurious failures on shared clusters
112+
# where GPU availability is dynamic (Issue #422).
113+
114+
intro_case GPU Probe
115+
116+
probe_pod="gpu-probe-$(date +%d-%H-%M-%S)"
117+
118+
if [ -n "${RUNTIME_CLASS_NAME:-}" ]; then
119+
probe_runtime_class="runtimeClassName: ${RUNTIME_CLASS_NAME}"
120+
else
121+
probe_runtime_class=""
122+
fi
123+
124+
kubectl apply -n "$NS" -f - <<PROBE
125+
apiVersion: v1
126+
kind: Pod
127+
metadata:
128+
name: ${probe_pod}
129+
labels:
130+
app: gpu-probe
131+
spec:
132+
${probe_runtime_class}
133+
containers:
134+
- name: pause
135+
image: registry.k8s.io/pause:3.10.2
136+
resources:
137+
limits:
138+
nvidia.com/gpu: "2"
139+
terminationGracePeriodSeconds: 0
140+
PROBE
141+
142+
expect '[ "$(kubectl get pod '"$probe_pod"' -n '"$NS"' -o jsonpath={.status.phase})" = "Running" ]'
143+
testnode=$(kubectl get pod "$probe_pod" -n "$NS" -o jsonpath='{.spec.nodeName}')
144+
echo "GPU probe Pod $probe_pod scheduled on Node $testnode — using it for the rest of the tests"
145+
146+
kubectl delete pod "$probe_pod" -n "$NS" --wait=true
147+
148+
cheer "GPU probe complete — test node is $testnode"
149+
105150
# ---------------------------------------------------------------------------
106151
# Create test objects
107152
# ---------------------------------------------------------------------------
108153

109154
intro_case Basic Launcher Pod Creation
110155

111-
objs=$("$MKOBJS_SCRIPT" -n "$NS")
156+
objs=$("$MKOBJS_SCRIPT" -n "$NS" --node "$testnode")
112157
isc=$(echo $objs | awk '{print $1}')
113158
lc=$(echo $objs | awk '{print $2}')
114159
rs=$(echo $objs | awk '{print $3}')
@@ -147,8 +192,7 @@ expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$inst | wc -l
147192

148193
export req1=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$inst | sed s%pod/%%)
149194
echo "Server-requesting Pod is $req1"
150-
testnode=$(kubectl get pod $req1 -n "$NS" -o jsonpath='{.spec.nodeName}')
151-
echo "The test Pods run on Node $testnode"
195+
[ "$(kubectl get pod $req1 -n "$NS" -o jsonpath='{.spec.nodeName}')" = "$testnode" ]
152196

153197
# Wait for launcher-to-requester binding, then capture the launcher name
154198
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$req1 | wc -l | grep -w 1"

0 commit comments

Comments
 (0)