@@ -71,6 +71,37 @@ expect() {
7171 done
7272}
7373
# pin_gpu patches the ReplicaSet to bypass OpenShift's GPU assignment.
# Sets the nvidia.com/gpu limit/request to 0 and injects NVIDIA_VISIBLE_DEVICES
# so subsequent pods reuse the same GPU UUID without going through the device plugin.
# Globals:   assigned_gpu_uuids (read), NS (read)
# Arguments: $1 - ReplicaSet name
# Outputs:   progress message to stderr; kubectl patch output to stdout
pin_gpu() {
  local rs="$1"
  echo "Pinning GPU for ReplicaSet $rs: NVIDIA_VISIBLE_DEVICES=$assigned_gpu_uuids" >&2
  local patch
  # Build the strategic-merge patch; %s receives the recorded GPU UUID(s).
  patch=$(printf \
    '{"spec":{"template":{"spec":{"containers":[{"name":"inference-server","resources":{"limits":{"nvidia.com/gpu":"0"},"requests":{"nvidia.com/gpu":"0"}},"env":[{"name":"NVIDIA_VISIBLE_DEVICES","value":"%s"}]}]}}}}' \
    "$assigned_gpu_uuids")
  kubectl patch rs "$rs" -n "$NS" -p "$patch"
}
88+
# check_gpu_pin waits for the pod's accelerators annotation and verifies it
# matches $assigned_gpu_uuids, ensuring the same GPU is reused after scale-up.
# Globals:   assigned_gpu_uuids (read), NS (read)
# Arguments: $1 - pod name
# Returns:   0 on match; exits 1 on mismatch
check_gpu_pin() {
  local pod="$1"
  # Wait until the controller has written the annotation (expect retries the test).
  # The \\. survives single-quote removal so the jsonpath sees \. (literal dots).
  expect '[ -n "$(kubectl get pod -n '"$NS"' '"$pod"' -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/accelerators})" ]'
  local actual_uuids
  actual_uuids=$(kubectl get pod "$pod" -n "$NS" -o jsonpath='{.metadata.annotations.dual-pods\.llm-d\.ai/accelerators}')
  if [ "$actual_uuids" != "$assigned_gpu_uuids" ]; then
    echo "ERROR: GPU UUID mismatch on pod $pod: expected=$assigned_gpu_uuids actual=$actual_uuids" >&2
    exit 1
  fi
  echo "GPU UUID(s) verified on pod $pod: $actual_uuids"
}
104+
74105# ---------------------------------------------------------------------------
75106# Create test objects
76107# ---------------------------------------------------------------------------
@@ -132,6 +163,16 @@ date
132163kubectl wait --for condition=Ready pod/$reqlb -n " $NS " --timeout=180s
133164[ " $( kubectl get pod $launcherlb -n " $NS " -o jsonpath=' {.status.conditions[?(@.type=="Ready")].status}' ) " = " True" ]
134165
# On OpenShift, record the GPU UUID assigned by the cluster so we can pin it later.
# The controller writes the UUID(s) to the dual-pods.llm-d.ai/accelerators annotation
# after querying the requester's SPI endpoint; it is guaranteed to be set by the time
# the pod is Ready.
if [ "$E2E_PLATFORM" = "openshift" ]; then
  expect '[ -n "$(kubectl get pod -n '"$NS"' '"$reqlb"' -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/accelerators})" ]'
  assigned_gpu_uuids=$(kubectl get pod "$reqlb" -n "$NS" -o jsonpath='{.metadata.annotations.dual-pods\.llm-d\.ai/accelerators}')
  echo "Assigned GPU UUID(s) on OpenShift: $assigned_gpu_uuids"
fi
175+
135176cheer Successful launcher-based pod creation
136177
137178# ---------------------------------------------------------------------------
@@ -156,13 +197,6 @@ if [ "$POLICIES_ENABLED" = true ]; then
156197 cheer CEL policy checks passed
157198fi
158199
159- # TODO: stop skipping once Issues 387 is resolved
160- if [ " $E2E_PLATFORM " = " openshift" ]; then
161- echo " Skipping the remaining test cases on OpenShift because Issue 387 is not resolved there yet" >&2
162- cheer All launcher-based tests that are currently expected to pass on OpenShift have done so
163- exit 0
164- fi
165-
166200# ---------------------------------------------------------------------------
167201# Same-Node Port Collision
168202# ---------------------------------------------------------------------------
@@ -232,6 +266,9 @@ kubectl scale rs $rslb -n "$NS" --replicas=0
232266
233267expect " kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"
234268
# On OpenShift, pin the GPU so the next scale-up reuses the same GPU.
if [ "$E2E_PLATFORM" = "openshift" ]; then pin_gpu "$rslb"; fi
271+
235272# Patch requester ReplicaSet to stick to testnode
236273kubectl patch rs $rslb -n " $NS " -p ' {"spec": {"template": {"spec": {"nodeSelector": {"kubernetes.io/hostname": "' $testnode ' "} }} }}'
237274
262299kubectl wait --for condition=Ready pod/$reqlb2 -n " $NS " --timeout=120s
263300[ " $( kubectl get pod $launcherlb -n " $NS " -o jsonpath=' {.status.conditions[?(@.type=="Ready")].status}' ) " = " True" ]
264301
# On OpenShift, verify the same GPU UUID was assigned after wake-up.
if [ "$E2E_PLATFORM" = "openshift" ]; then check_gpu_pin "$reqlb2"; fi
304+
265305cheer Successful instance wake-up fast path
266306
267307# ---------------------------------------------------------------------------
305345kubectl wait --for condition=Ready pod/$reqlb3 -n " $NS " --timeout=120s
306346[ " $( kubectl get pod $launcherlb -n " $NS " -o jsonpath=' {.status.conditions[?(@.type=="Ready")].status}' ) " = " True" ]
307347
348+ if [ " $E2E_PLATFORM " = " openshift" ]; then check_gpu_pin $reqlb3 ; fi
349+
308350cheer Successful multiple instances sharing one launcher
309351
310352# ---------------------------------------------------------------------------
348390kubectl wait --for condition=Ready pod/$reqlb4 -n " $NS " --timeout=120s
349391[ " $( kubectl get pod $launcherlb -n " $NS " -o jsonpath=' {.status.conditions[?(@.type=="Ready")].status}' ) " = " True" ]
350392
393+ if [ " $E2E_PLATFORM " = " openshift" ]; then check_gpu_pin $reqlb4 ; fi
394+
351395cheer Successful switching instances in one launcher
352396
353397# ---------------------------------------------------------------------------
412456kubectl wait --for condition=Ready pod/$reqlb_post_restart -n " $NS " --timeout=30s
413457[ " $( kubectl get pod $launcherlb -n " $NS " -o jsonpath=' {.status.conditions[?(@.type=="Ready")].status}' ) " = " True" ]
414458
459+ if [ " $E2E_PLATFORM " = " openshift" ]; then check_gpu_pin $reqlb_post_restart ; fi
460+
415461cheer Successful controller restart state recovery
416462
417463# ---------------------------------------------------------------------------
449495kubectl wait --for condition=Ready pod/$reqlb_after_delete -n " $NS " --timeout=120s
450496[ " $( kubectl get pod $launcherlb_after_delete -n " $NS " -o jsonpath=' {.status.conditions[?(@.type=="Ready")].status}' ) " = " True" ]
451497
498+ if [ " $E2E_PLATFORM " = " openshift" ]; then check_gpu_pin $reqlb_after_delete ; fi
499+
452500cheer Successful unbound launcher deletion cleanup
453501
454502cheer All launcher-based tests passed
0 commit comments