Skip to content

Commit 0ec0d9e

Browse files
MikeSpreitzer and claude
committed
Probe for a 2-GPU node before running E2E test cases
Before creating any test objects, launch a throwaway Pod that requests 2 GPUs. The scheduler places it on a node that actually has 2 GPUs free right now. Record that node, delete the probe, and pin the requester ReplicaSet to it via a new --node flag accepted by both mkobjs scripts. This prevents spurious failures on shared clusters where GPU availability is dynamic (Issue #422). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 2993df4 commit 0ec0d9e

3 files changed

Lines changed: 88 additions & 7 deletions

File tree

test/e2e/mkobjs-openshift.sh

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# Creates test Kubernetes objects for the OpenShift / real-cluster E2E path.
44
#
5-
# Usage: mkobjs-openshift.sh [-n <namespace>]
5+
# Usage: mkobjs-openshift.sh [-n <namespace>] [--node <node-name>]
66
#
77
# Required environment variables:
88
# LAUNCHER_IMAGE - container image for the launcher pod
@@ -17,8 +17,9 @@
1717

1818
set -euo pipefail
1919

20-
# Parse optional -n / --namespace flag
20+
# Parse optional flags
2121
ns_flag=()
22+
node_name=""
2223
while [ $# -gt 0 ]; do
2324
case "$1" in
2425
-n|--namespace)
@@ -30,6 +31,15 @@ while [ $# -gt 0 ]; do
3031
exit 1
3132
fi
3233
;;
34+
--node)
35+
if [ $# -gt 1 ] ; then
36+
node_name="$2"
37+
shift 2
38+
else
39+
echo "Missing --node argument" >&2
40+
exit 1
41+
fi
42+
;;
3343
*)
3444
echo "Unknown argument: $1" >&2
3545
exit 1
@@ -48,6 +58,14 @@ if [ -n "${RUNTIME_CLASS_NAME:-}" ]; then
4858
runtime_class="runtimeClassName: ${RUNTIME_CLASS_NAME}"
4959
fi
5060

61+
# When a node is specified, pin the ReplicaSet's pods to it.
62+
if [ -n "$node_name" ]; then
63+
node_selector="nodeSelector:
64+
kubernetes.io/hostname: \"$node_name\""
65+
else
66+
node_selector=""
67+
fi
68+
5169
if out=$(kubectl apply "${ns_flag[@]}" -f - 2>&1 <<EOF
5270
apiVersion: v1
5371
kind: ServiceAccount
@@ -218,6 +236,7 @@ spec:
218236
dual-pods.llm-d.ai/inference-server-config: "inference-server-config-smol-$inst"
219237
spec:
220238
${runtime_class}
239+
${node_selector}
221240
containers:
222241
- name: inference-server
223242
image: ${REQUESTER_IMAGE}

test/e2e/mkobjs.sh

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/usr/bin/env bash
22

3-
# Parse optional -n / --namespace flag
3+
# Parse optional flags
44
ns_flag=()
5+
node_name=""
56
while [ $# -gt 0 ]; do
67
case "$1" in
78
-n|--namespace)
@@ -13,13 +14,30 @@ while [ $# -gt 0 ]; do
1314
exit 1
1415
fi
1516
;;
17+
--node)
18+
if [ $# -gt 1 ] ; then
19+
node_name="$2"
20+
shift 2
21+
else
22+
echo "Missing --node argument" >&2
23+
exit 1
24+
fi
25+
;;
1626
*)
1727
echo "Unknown argument: $1" >&2
1828
exit 1
1929
;;
2030
esac
2131
done
2232

33+
# When a node is specified, pin the ReplicaSet's pods to it.
34+
if [ -n "$node_name" ]; then
35+
node_selector="nodeSelector:
36+
kubernetes.io/hostname: \"$node_name\""
37+
else
38+
node_selector=""
39+
fi
40+
2341
inst=$(date +%d-%H-%M-%S)
2442
requester_img=$(make echo-var VAR=TEST_REQUESTER_IMG)
2543
launcher_img=$(make echo-var VAR=TEST_LAUNCHER_IMG)
@@ -194,8 +212,8 @@ spec:
194212
nvidia.com/gpu: "1"
195213
cpu: "200m"
196214
memory: 250Mi
215+
${node_selector}
197216
serviceAccount: testreq
198-
# nodeName: fmatest-worker # try fixed node for the consistency of value of dual-pods.llm-d.ai/launcher-config-hash annotation
199217
EOF
200218
)
201219
then

test/e2e/test-cases.sh

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,58 @@ check_gpu_pin() {
102102
echo "GPU UUID(s) verified on pod $pod: $actual_uuids"
103103
}
104104

105+
# ---------------------------------------------------------------------------
106+
# Probe for a node with 2 free GPUs
107+
# ---------------------------------------------------------------------------
108+
# Create a throwaway Pod that requests 2 GPUs. The scheduler will place it
109+
# on a node that actually has 2 GPUs available right now. Once it is running
110+
# we record the node, delete the probe Pod, and pin every subsequent test
111+
# workload to that node. This avoids spurious failures on shared clusters
112+
# where GPU availability is dynamic (Issue #422).
113+
114+
intro_case GPU Probe
115+
116+
probe_pod="gpu-probe-$(date +%d-%H-%M-%S)"
117+
118+
if [ -n "${RUNTIME_CLASS_NAME:-}" ]; then
119+
probe_runtime_class="runtimeClassName: ${RUNTIME_CLASS_NAME}"
120+
else
121+
probe_runtime_class=""
122+
fi
123+
124+
kubectl apply -n "$NS" -f - <<PROBE
125+
apiVersion: v1
126+
kind: Pod
127+
metadata:
128+
name: ${probe_pod}
129+
labels:
130+
app: gpu-probe
131+
spec:
132+
${probe_runtime_class}
133+
containers:
134+
- name: pause
135+
image: registry.k8s.io/pause:3.10.2
136+
resources:
137+
limits:
138+
nvidia.com/gpu: "2"
139+
terminationGracePeriodSeconds: 0
140+
PROBE
141+
142+
expect '[ "$(kubectl get pod '"$probe_pod"' -n '"$NS"' -o jsonpath={.status.phase})" = "Running" ]'
143+
testnode=$(kubectl get pod "$probe_pod" -n "$NS" -o jsonpath='{.spec.nodeName}')
144+
echo "GPU probe Pod $probe_pod scheduled on Node $testnode — using it for the rest of the tests"
145+
146+
kubectl delete pod "$probe_pod" -n "$NS" --wait=true
147+
148+
cheer "GPU probe complete — test node is $testnode"
149+
105150
# ---------------------------------------------------------------------------
106151
# Create test objects
107152
# ---------------------------------------------------------------------------
108153

109154
intro_case Basic Launcher Pod Creation
110155

111-
objs=$("$MKOBJS_SCRIPT" -n "$NS")
156+
objs=$("$MKOBJS_SCRIPT" -n "$NS" --node "$testnode")
112157
isc=$(echo $objs | awk '{print $1}')
113158
lc=$(echo $objs | awk '{print $2}')
114159
rs=$(echo $objs | awk '{print $3}')
@@ -147,8 +192,7 @@ expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$inst | wc -l
147192

148193
export req1=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$inst | sed s%pod/%%)
149194
echo "Server-requesting Pod is $req1"
150-
testnode=$(kubectl get pod $req1 -n "$NS" -o jsonpath='{.spec.nodeName}')
151-
echo "The test Pods run on Node $testnode"
195+
[ "$(kubectl get pod $req1 -n "$NS" -o jsonpath='{.spec.nodeName}')" = "$testnode" ]
152196

153197
# Wait for launcher-to-requester binding, then capture the launcher name
154198
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$req1 | wc -l | grep -w 1"

0 commit comments

Comments
 (0)