Commit 9c39c87

Control the GPU assignment for e2e test on OpenShift (#403)

* Control the GPU assignment for e2e test on OpenShift
* Typo fix

Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
1 parent 77f97d2 commit 9c39c87

2 files changed: 58 additions & 8 deletions

pkg/api/interface.go

Lines changed: 3 additions & 1 deletion
@@ -90,7 +90,9 @@ type ProviderData struct {
 // AcceleratorsAnnotationName is the name of an annotation that the dual-pods controller
 // maintains on both server-requesting and server-providing Pods.
 // This annotation is purely FYI emitted by the dual-pods controller
-// (it does not rely on this label for anything).
+// (it does not rely on this annotation for anything).
+// External consumers may read it for their own purposes; for example,
+// the launcher-based e2e test reads it to pin the GPU UUID when running on OpenShift.
 const AcceleratorsAnnotationName string = "dual-pods.llm-d.ai/accelerators"
 
 // LauncherBasedAnnotationName is the name of an annotation that indicates that
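
As the new comment notes, external consumers can read this annotation straight from pod metadata. A minimal sketch of doing so with kubectl (the pod name example-pod and namespace example-ns are placeholders; the dots in the annotation key must be escaped as \. inside a jsonpath expression):

    kubectl get pod example-pod -n example-ns \
      -o jsonpath='{.metadata.annotations.dual-pods\.llm-d\.ai/accelerators}'

This is exactly the read the e2e test below performs when pinning the GPU on OpenShift.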

test/e2e/test-cases.sh

Lines changed: 55 additions & 7 deletions
@@ -71,6 +71,37 @@ expect() {
     done
 }
 
+# pin_gpu patches the ReplicaSet to bypass OpenShift's GPU assignment.
+# Sets nvidia.com/gpu limit/request to 0 and injects NVIDIA_VISIBLE_DEVICES
+# so subsequent pods reuse the same GPU UUID without going through the device plugin.
+# Uses global $assigned_gpu_uuids and $NS.
+# Arguments: <rs-name>
+pin_gpu() {
+    local rs="$1"
+    echo "Pinning GPU for ReplicaSet $rs: NVIDIA_VISIBLE_DEVICES=$assigned_gpu_uuids" >&2
+    local patch
+    patch=$(printf \
+        '{"spec":{"template":{"spec":{"containers":[{"name":"inference-server","resources":{"limits":{"nvidia.com/gpu":"0"},"requests":{"nvidia.com/gpu":"0"}},"env":[{"name":"NVIDIA_VISIBLE_DEVICES","value":"%s"}]}]}}}}' \
+        "$assigned_gpu_uuids")
+    kubectl patch rs "$rs" -n "$NS" -p "$patch"
+}
+
+# check_gpu_pin waits for the pod's accelerators annotation and verifies it
+# matches $assigned_gpu_uuids, ensuring the same GPU is reused after scale-up.
+# Uses global $assigned_gpu_uuids and $NS.
+# Arguments: <pod-name>
+check_gpu_pin() {
+    local pod="$1"
+    expect '[ -n "$(kubectl get pod -n '"$NS"' '"$pod"' -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/accelerators})" ]'
+    local actual_uuids
+    actual_uuids=$(kubectl get pod "$pod" -n "$NS" -o jsonpath='{.metadata.annotations.dual-pods\.llm-d\.ai/accelerators}')
+    if [ "$actual_uuids" != "$assigned_gpu_uuids" ]; then
+        echo "ERROR: GPU UUID mismatch on pod $pod: expected=$assigned_gpu_uuids actual=$actual_uuids" >&2
+        exit 1
+    fi
+    echo "GPU UUID(s) verified on pod $pod: $actual_uuids"
+}
+
 # ---------------------------------------------------------------------------
 # Create test objects
 # ---------------------------------------------------------------------------
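
Taken together, the two helpers form a pin-and-verify cycle. A usage sketch under the script's own conventions (ReplicaSet name example-rs and pod name example-pod are placeholders; $NS and $assigned_gpu_uuids are assumed to have been set by the recording step shown further down):

    kubectl scale rs example-rs -n "$NS" --replicas=0   # park the requester
    pin_gpu example-rs                                  # zero the GPU request, inject the recorded UUID
    kubectl scale rs example-rs -n "$NS" --replicas=1   # bring up a fresh pod
    check_gpu_pin example-pod                           # assert the same GPU came back

Since kubectl patch defaults to a strategic merge, the containers entry in pin_gpu's patch is matched by name and merged into the existing inference-server container rather than replacing the whole container list.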
@@ -132,6 +163,16 @@ date
 kubectl wait --for condition=Ready pod/$reqlb -n "$NS" --timeout=180s
 [ "$(kubectl get pod $launcherlb -n "$NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]
 
+# On OpenShift, record the GPU UUID assigned by the cluster so we can pin it later.
+# The controller writes the UUID(s) to the dual-pods.llm-d.ai/accelerators annotation
+# after querying the requester's SPI endpoint; it is guaranteed to be set by the time
+# the pod is Ready.
+if [ "$E2E_PLATFORM" = "openshift" ]; then
+    expect '[ -n "$(kubectl get pod -n '"$NS"' $reqlb -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/accelerators})" ]'
+    assigned_gpu_uuids=$(kubectl get pod "$reqlb" -n "$NS" -o jsonpath='{.metadata.annotations.dual-pods\.llm-d\.ai/accelerators}')
+    echo "Assigned GPU UUID(s) on OpenShift: $assigned_gpu_uuids"
+fi
+
 cheer Successful launcher-based pod creation
 
 # ---------------------------------------------------------------------------
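
Note the two escaping depths in the block above: the direct jsonpath call escapes the dots in the annotation key once (\.), while the string handed to expect doubles the backslashes (\\.), presumably because expect re-evaluates its argument on each retry (its body is not shown in this diff). A side-by-side sketch with a placeholder pod name:

    # Direct call: one level of escaping suffices.
    kubectl get pod example-pod -n "$NS" \
      -o jsonpath='{.metadata.annotations.dual-pods\.llm-d\.ai/accelerators}'
    # Via expect, which re-evaluates the string: double each backslash.
    expect '[ -n "$(kubectl get pod -n '"$NS"' example-pod -o jsonpath={.metadata.annotations.dual-pods\\.llm-d\\.ai/accelerators})" ]'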
@@ -156,13 +197,6 @@ if [ "$POLICIES_ENABLED" = true ]; then
     cheer CEL policy checks passed
 fi
 
-# TODO: stop skipping once Issues 387 is resolved
-if [ "$E2E_PLATFORM" = "openshift" ]; then
-    echo "Skipping the remaining test cases on OpenShift because Issue 387 is not resolved there yet" >&2
-    cheer All launcher-based tests that are currently expected to pass on OpenShift have done so
-    exit 0
-fi
-
 # ---------------------------------------------------------------------------
 # Same-Node Port Collision
 # ---------------------------------------------------------------------------
@@ -232,6 +266,9 @@ kubectl scale rs $rslb -n "$NS" --replicas=0
 
 expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"
 
+# On OpenShift, pin the GPU so the next scale-up reuses the same GPU.
+if [ "$E2E_PLATFORM" = "openshift" ]; then pin_gpu $rslb; fi
+
 # Patch requester ReplicaSet to stick to testnode
 kubectl patch rs $rslb -n "$NS" -p '{"spec": {"template": {"spec": {"nodeSelector": {"kubernetes.io/hostname": "'$testnode'"} }} }}'
 
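The pin works because the NVIDIA container toolkit accepts GPU UUIDs in NVIDIA_VISIBLE_DEVICES: with the nvidia.com/gpu request zeroed, the device plugin allocates nothing, and the injected variable alone decides which GPU the container sees. One way to double-check which device a pinned pod actually got (pod name example-pod is a placeholder; assumes nvidia-smi is present in the image):

    kubectl exec example-pod -n "$NS" -c inference-server -- \
      nvidia-smi --query-gpu=uuid --format=csv,noheader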
@@ -262,6 +299,9 @@ date
 kubectl wait --for condition=Ready pod/$reqlb2 -n "$NS" --timeout=120s
 [ "$(kubectl get pod $launcherlb -n "$NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]
 
+# On OpenShift, verify the same GPU UUID was assigned after wake-up.
+if [ "$E2E_PLATFORM" = "openshift" ]; then check_gpu_pin $reqlb2; fi
+
 cheer Successful instance wake-up fast path
 
 # ---------------------------------------------------------------------------
@@ -305,6 +345,8 @@ date
 kubectl wait --for condition=Ready pod/$reqlb3 -n "$NS" --timeout=120s
 [ "$(kubectl get pod $launcherlb -n "$NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]
 
+if [ "$E2E_PLATFORM" = "openshift" ]; then check_gpu_pin $reqlb3; fi
+
 cheer Successful multiple instances sharing one launcher
 
 # ---------------------------------------------------------------------------
@@ -348,6 +390,8 @@ date
 kubectl wait --for condition=Ready pod/$reqlb4 -n "$NS" --timeout=120s
 [ "$(kubectl get pod $launcherlb -n "$NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]
 
+if [ "$E2E_PLATFORM" = "openshift" ]; then check_gpu_pin $reqlb4; fi
+
 cheer Successful switching instances in one launcher
 
 # ---------------------------------------------------------------------------
@@ -412,6 +456,8 @@ date
 kubectl wait --for condition=Ready pod/$reqlb_post_restart -n "$NS" --timeout=30s
 [ "$(kubectl get pod $launcherlb -n "$NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]
 
+if [ "$E2E_PLATFORM" = "openshift" ]; then check_gpu_pin $reqlb_post_restart; fi
+
 cheer Successful controller restart state recovery
 
 # ---------------------------------------------------------------------------
@@ -449,6 +495,8 @@ date
 kubectl wait --for condition=Ready pod/$reqlb_after_delete -n "$NS" --timeout=120s
 [ "$(kubectl get pod $launcherlb_after_delete -n "$NS" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]
 
+if [ "$E2E_PLATFORM" = "openshift" ]; then check_gpu_pin $reqlb_after_delete; fi
+
 cheer Successful unbound launcher deletion cleanup
 
 cheer All launcher-based tests passed
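
With the Issue 387 skip removed, the whole launcher-based suite now runs on OpenShift when the platform flag is set. A hypothetical invocation (the entry point and the exact set of required variables are assumptions; E2E_PLATFORM and NS are the variables this script actually reads):

    E2E_PLATFORM=openshift NS=example-ns ./test/e2e/test-cases.sh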
