llm-d-fast-model-actuation/test/e2e/run-launcher-based.sh at 85f29061af477e227d1598bdd53ed51e5d1ef312 · waltforme/llm-d-fast-model-actuation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
#!/usr/bin/env bash

# Usage: $0
# Current working directory must be the root of the Git repository.
# This script tests launcher-based server-providing pods independently.

set -euo pipefail

set -x

green=$'\033[0;32m'
nocolor=$'\033[0m'
nl=$'\n'

function cheer() {
    echo
    echo "${nl}${green}✔${nocolor} $*"
    echo
}

function expect() {
    local elapsed=0
    local start=$(date)
    local limit=${LIMIT:-600}
    while true; do
        kubectl get pods -L dual-pods.llm-d.ai/dual,dual-pods.llm-d.ai/sleeping
        if eval "$1"; then return; fi
        if (( elapsed > limit )); then
            echo "Did not become true (from $start to $(date)): $1" >&2
            exit 99
        fi
        sleep 5
        elapsed=$(( elapsed+5 ))
    done
}

function clear_img_repo() (
    set +o pipefail
    docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.CreatedAt}}" $1 | fgrep -v '<none>' | grep -vw REPOSITORY | while read name tag rest; do
	docker rmi $name:$tag
    done
)

: Build the container images, no push

clear_img_repo ko.local/test-requester
clear_img_repo my-registry/my-namespace/test-requester
clear_img_repo my-registry/my-namespace/test-launcher
clear_img_repo ko.local/dual-pods-controller
clear_img_repo my-registry/my-namespace/dual-pods-controller
clear_img_repo ko.local/launcher-populator
clear_img_repo my-registry/my-namespace/launcher-populator
make build-test-requester-local
make build-test-launcher-local
make build-controller-local
make build-populator-local

: Set up the kind cluster

kind delete cluster --name fmatest
kind create cluster --name fmatest --config - <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
EOF

kubectl wait --for=create sa default
kubectl wait --for condition=Ready node fmatest-control-plane
kubectl wait --for condition=Ready node fmatest-worker

# Display health, prove we don't have https://kind.sigs.k8s.io/docs/user/known-issues/#pod-errors-due-to-too-many-open-files
kubectl get pods -A -o wide

kubectl create clusterrole node-viewer --verb=get,list,watch --resource=nodes

kubectl create -f ./config/crd/

kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: testreq
rules:
- apiGroups:
  - "fma.llm-d.ai"
  resources:
  - inferenceserverconfigs
  - launcherconfigs
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resourceNames:
  - gpu-map
  - gpu-allocs
  resources:
  - configmaps
  verbs:
  - update
  - patch
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - get
  - list
  - watch
EOF

kubectl create rolebinding testreq --role=testreq --serviceaccount=$(kubectl get sa default -o jsonpath={.metadata.namespace}):testreq

kubectl create sa testreq

kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: testlauncher
rules:
- apiGroups:
  - ""
  resourceNames:
  - gpu-map
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
EOF

kubectl create rolebinding testlauncher --role=testlauncher --serviceaccount=$(kubectl get sa default -o jsonpath={.metadata.namespace}):testlauncher

kubectl create sa testlauncher
kubectl create cm gpu-map
kubectl get nodes -o name | sed 's%^node/%%' | while read node; do
    kubectl label node $node nvidia.com/gpu.present=true nvidia.com/gpu.product=NVIDIA-L40S nvidia.com/gpu.count=2 --overwrite=true
    kubectl patch node $node --subresource status -p '{"status": {"capacity": {"nvidia.com/gpu": 2}, "allocatable": {"nvidia.com/gpu": 2} }}'
done

: Load the container images into the kind cluster

make load-test-requester-local
make load-test-launcher-local
make load-controller-local
make load-populator-local

: Detect whether API server supports ValidatingAdmissionPolicy

POLICIES_ENABLED=false
if kubectl api-resources --api-group=admissionregistration.k8s.io -o name | grep -q 'validatingadmissionpolicies'; then
  POLICIES_ENABLED=true
  kubectl apply -f config/validating-admission-policies
fi

: Deploy the FMA controllers in the cluster

img_reg=$(make echo-var VAR=CONTAINER_IMG_REG)
img_tag=$(make echo-var VAR=IMAGE_TAG)

helm upgrade --install fma charts/fma-controllers \
  --set global.imageRegistry="$img_reg" \
  --set global.imageTag="$img_tag" \
  --set global.nodeViewClusterRole=node-viewer \
  --set dualPodsController.sleeperLimit=2 \
  --set global.local=true \
  --set dualPodsController.debugAcceleratorMemory=false \
  --set launcherPopulator.enabled=true

: Populate GPU map for testing

gi=0
kubectl get nodes -o name | sed 's%^node/%%' | while read node; do
    let gi1=gi+1
    kubectl patch cm gpu-map -p "data:${nl} ${node}: '{\"GPU-$gi\": 0, \"GPU-$gi1\": 1 }'"
    let gi=gi1+1
done

: Wait for FMA controllers to be ready

kubectl wait --for=condition=available deployment/fma-dual-pods-controller --timeout=120s
kubectl get pods -l app.kubernetes.io/component=dual-pods-controller

kubectl wait --for=condition=available deployment/fma-launcher-populator --timeout=120s
kubectl get pods -l app.kubernetes.io/component=launcher-populator

: Test launcher-based server-providing pods

: Basic Launcher Pod Creation

objs=$(test/e2e/mkobjs.sh)
isc=$(echo $objs | awk '{print $1}')
lc=$(echo $objs | awk '{print $2}')
rslb=$(echo $objs | awk '{print $3}')
isc2=$(echo $objs | awk '{print $4}')
lpp=$(echo $objs | awk '{print $5}')
instlb=${rslb#my-request-}

# LauncherPopulationPolicy specifies launcherCount per node with nvidia.com/gpu.present=true
GPU_NODES=$(kubectl get nodes -l nvidia.com/gpu.present=true --field-selector spec.unschedulable!=true -o name | wc -l | tr -d ' ')
echo "Expecting launcher-populator to create $GPU_NODES launcher(s) (one per schedulable GPU node)"
expect "[ \$(kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | wc -l | tr -d ' ') -ge $GPU_NODES ]"
echo "Launcher-populator created launchers successfully"
kubectl get pods -l dual-pods.llm-d.ai/launcher-config-name=$lc

# Expect requester pod to be created
expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"

export reqlb=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)

# Expect launcher pod to be created (not a direct provider)
expect "kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb | wc -l | grep -w 1"

export launcherlb=$(kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb | sed s%pod/%%)

# Verify requester is bound to launcher
expect '[ "$(kubectl get pod $reqlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'

# Verify launcher is bound to requester
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb" ]'

# Wait for both pods to be ready (vLLM on CPU takes ~90s to start)
date
kubectl wait --for condition=Ready pod/$reqlb --timeout=180s
[ "$(kubectl get pod $launcherlb -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]

cheer Successful launcher-based pod creation

: Test CEL policy verification if enabled

if [ "${POLICIES_ENABLED}" = true ]; then
  if ! test/e2e/validate.sh; then
    echo "ERROR: CEL policy tests failed!" >&2
    exit 1
  fi
  cheer CEL policy checks passed
fi

: Instance Wake-up Fast Path

# Scale requester to 0 (instance should sleep in launcher)
kubectl scale rs $rslb --replicas=0

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"

# Launcher should remain
kubectl get pod $launcherlb

# Verify launcher is unbound (no dual label pointing to requester)
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'

# Scale back up (should reuse same launcher and wake sleeping instance)
kubectl scale rs $rslb --replicas=1

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"

reqlb2=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)

# Should still be using the same launcher pod
expect "kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb2 | wc -l | grep -w 1"
launcherlb2=$(kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb2 | sed s%pod/%%)
[ "$launcherlb2" == "$launcherlb" ]

# Verify new requester is bound to same launcher
expect '[ "$(kubectl get pod $reqlb2 -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'

# Verify launcher is bound to new requester
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb2" ]'

# Wait for requester to be ready (launcher should already be ready)
date
kubectl wait --for condition=Ready pod/$reqlb2 --timeout=120s
[ "$(kubectl get pod $launcherlb -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]

cheer Successful instance wake-up fast path

: Multiple Instances Share One Launcher

# Scale requester to 0 again
kubectl scale rs $rslb --replicas=0

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"

# Launcher should remain
kubectl get pod $launcherlb

# Verify launcher is unbound
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'

# Patch ReplicaSet to use isc2 instead of isc
kubectl patch rs $rslb -p='{"spec":{"template":{"metadata":{"annotations":{"dual-pods.llm-d.ai/inference-server-config":"'$isc2'"}}}}}'

# Scale back up (should reuse same launcher and create 2nd instance)
kubectl scale rs $rslb --replicas=1

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"

reqlb3=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)

# Should still be using the same launcher pod
expect "kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb3 | wc -l | grep -w 1"
launcherlb3=$(kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb3 | sed s%pod/%%)
[ "$launcherlb3" == "$launcherlb" ]

# Verify new requester is bound to same launcher
expect '[ "$(kubectl get pod $reqlb3 -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'

# Verify launcher is bound to new requester
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb3" ]'

# Wait for requester to be ready (launcher should already be ready)
date
kubectl wait --for condition=Ready pod/$reqlb3 --timeout=120s
[ "$(kubectl get pod $launcherlb -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]

cheer Successful multiple instances sharing one launcher

: Switch Instances In One Launcher

# Scale requester to 0 again
kubectl scale rs $rslb --replicas=0

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"

# Launcher should remain
kubectl get pod $launcherlb

# Verify launcher is unbound
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'

# Patch ReplicaSet back to use original isc
kubectl patch rs $rslb -p='{"spec":{"template":{"metadata":{"annotations":{"dual-pods.llm-d.ai/inference-server-config":"'$isc'"}}}}}'

# Scale back up (should reuse same launcher and wake first instance)
kubectl scale rs $rslb --replicas=1

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"

reqlb4=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)

# Should still be using the same launcher pod
expect "kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb4 | wc -l | grep -w 1"
launcherlb4=$(kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb4 | sed s%pod/%%)
[ "$launcherlb4" == "$launcherlb" ]

# Verify new requester is bound to same launcher
expect '[ "$(kubectl get pod $reqlb4 -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'

# Verify launcher is bound to new requester
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb4" ]'

# Wait for requester to be ready (launcher should already be ready)
date
kubectl wait --for condition=Ready pod/$reqlb4 --timeout=120s
[ "$(kubectl get pod $launcherlb -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]

cheer Successful switching instances in one launcher

: Controller Restart State Recovery

# This test verifies that the controller can rebuild its internal state after restart
# by syncing launcher instances from unbound launcher pods

# Scale requester to 0 to create sleeping instances
kubectl scale rs $rslb --replicas=0

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"

# Verify launcher set is unchanged and target launcher is unbound
launcher_count_pre_restart=$(kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | wc -l)
kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | grep -x "pod/$launcherlb"
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'

# Verify launcher has sleeping instances before restart
launcher_instances_before=$(kubectl exec $launcherlb -- python3 -c 'import json,urllib.request; print(json.load(urllib.request.urlopen("http://127.0.0.1:8001/v2/vllm/instances"))["total_instances"])')
echo "Launcher has $launcher_instances_before instances before controller restart"
[ "$launcher_instances_before" -gt "0" ]

# Restart the dual-pods controller to test state recovery
echo "Restarting dual-pods controller..."
kubectl rollout restart deployment fma-dual-pods-controller
kubectl rollout status deployment fma-dual-pods-controller --timeout=60s

# Wait for controller to be ready for ongoing checks
# In detail: allow some time for the dual-pods controller to do something unexpected in the case that the controller is behaving incorrectly,
# so that the ongoing checks have some chance to fail thus detect the incorrectness, instead of just quickly and coincidentally passing.
sleep 30

# Verify launcher pod set size is unchanged and target launcher is still running
expect "kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | wc -l | grep -w $launcher_count_pre_restart"
kubectl get pods -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | grep -x "pod/$launcherlb"

# Verify launcher still has the same number of instances after controller restart
launcher_instances_after=$(kubectl exec $launcherlb -- python3 -c 'import json,urllib.request; print(json.load(urllib.request.urlopen("http://127.0.0.1:8001/v2/vllm/instances"))["total_instances"])')
echo "Launcher has $launcher_instances_after instances after controller restart"
[ "$launcher_instances_after" == "$launcher_instances_before" ]

# Now scale up requester - controller should correctly select the launcher with sleeping instance
# Use isc2 which should have a sleeping instance from before
kubectl patch rs $rslb --type=json -p='[{"op": "replace", "path": "/spec/template/metadata/annotations/dual-pods.llm-d.ai~1inference-server-config", "value": "'$isc2'"}]'
kubectl scale rs $rslb --replicas=1

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
reqlb_post_restart=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)

# Verify requester is bound to the same launcher (controller recovered state correctly)
expect '[ "$(kubectl get pod $reqlb_post_restart -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$reqlb_post_restart" ]'

# Verify requester becomes ready (fast wake-up path should work)
date
kubectl wait --for condition=Ready pod/$reqlb_post_restart --timeout=30s
[ "$(kubectl get pod $launcherlb -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]

cheer Successful controller restart state recovery

: Unbound Launcher Deletion Cleanup

# This test verifies that deleting an unbound launcher does not leave the controller
# stuck with stale instance state.

kubectl scale rs $rslb --replicas=0

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 0"
expect '[ "$(kubectl get pod $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'

kubectl delete pod $launcherlb --wait=true

! kubectl get pod $launcherlb

kubectl scale rs $rslb --replicas=1

expect "kubectl get pods -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
reqlb_after_delete=$(kubectl get pods -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
expect "kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb_after_delete | wc -l | grep -w 1"
launcherlb_after_delete=$(kubectl get pods -o name -l dual-pods.llm-d.ai/dual=$reqlb_after_delete | sed s%pod/%%)
[ "$launcherlb_after_delete" != "$launcherlb" ]
expect '[ "$(kubectl get pod $reqlb_after_delete -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb_after_delete" ]'

date
kubectl wait --for condition=Ready pod/$reqlb_after_delete --timeout=120s
[ "$(kubectl get pod $launcherlb_after_delete -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')" = "True" ]

cheer Successful unbound launcher deletion cleanup

: Clean up launcher-based workloads

kubectl scale rs $rslb --replicas=0
expect '[ $(kubectl get pods -o name | grep "^pod/my-request-" | wc -l) == "0" ]'

cheer All launcher-based tests passed