Skip to content

Commit ecf0d21

Browse files
authored
ci: Enable launcher-populator in OpenShift E2E and local tests (#348)
* ci: Enable launcher-populator in OpenShift E2E and local tests Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: aavarghese <avarghese@us.ibm.com> * Fix launcher populator deployment Signed-off-by: aavarghese <avarghese@us.ibm.com> * More fixes for launcher populator created launchers to work Signed-off-by: aavarghese <avarghese@us.ibm.com> * Review comments Signed-off-by: aavarghese <avarghese@us.ibm.com> --------- Signed-off-by: aavarghese <avarghese@us.ibm.com>
1 parent 1207e11 commit ecf0d21

File tree

6 files changed

+156
-89
lines changed

6 files changed

+156
-89
lines changed

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 40 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -529,21 +529,28 @@ jobs:
529529
--set dualPodsController.sleeperLimit=2 \
530530
--set global.local=false \
531531
--set dualPodsController.debugAcceleratorMemory=false \
532-
--set launcherPopulator.enabled=false
532+
--set launcherPopulator.enabled=true
533533
534-
- name: Wait for controller to be ready
534+
- name: Wait for FMA controllers to be ready
535535
run: |
536-
echo "Waiting for FMA controller deployment to be ready..."
537536
kubectl wait --for=condition=available --timeout=120s \
538537
deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
539-
540538
echo ""
541-
echo "=== Controller Pod Status ==="
542-
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/name=fma-controllers
539+
echo "=== Dual-Pod Controller Pod Status ==="
540+
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=dual-pods-controller
543541
echo ""
544-
echo "=== Controller Deployment ==="
542+
echo "=== Dual-Pod Controller Deployment ==="
545543
kubectl get deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
546544
545+
kubectl wait --for=condition=available --timeout=120s \
546+
deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"
547+
echo ""
548+
echo "=== Launcher Populator Pod Status ==="
549+
kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=launcher-populator
550+
echo ""
551+
echo "=== Launcher Populator Deployment ==="
552+
kubectl get deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"
553+
547554
- name: Verify controller health
548555
run: |
549556
echo "Checking controller pod for issues..."
@@ -671,6 +678,19 @@ jobs:
671678
- name: XDG_CONFIG_HOME
672679
value: "/tmp"
673680
---
681+
apiVersion: fma.llm-d.ai/v1alpha1
682+
kind: LauncherPopulationPolicy
683+
metadata:
684+
name: lpp-${INST}
685+
spec:
686+
enhancedNodeSelector:
687+
labelSelector:
688+
matchLabels:
689+
nvidia.com/gpu.present: "true"
690+
countForLauncher:
691+
- launcherConfigName: launcher-config-${INST}
692+
launcherCount: 1
693+
---
674694
apiVersion: apps/v1
675695
kind: ReplicaSet
676696
metadata:
@@ -720,6 +740,7 @@ jobs:
720740
echo "instance=${INST}" >> $GITHUB_OUTPUT
721741
echo "isc=inference-server-config-${INST}" >> $GITHUB_OUTPUT
722742
echo "lc=launcher-config-${INST}" >> $GITHUB_OUTPUT
743+
echo "lpp=lpp-${INST}" >> $GITHUB_OUTPUT
723744
echo "rs=my-request-${INST}" >> $GITHUB_OUTPUT
724745
echo "Test objects created"
725746
@@ -739,8 +760,6 @@ jobs:
739760
fi
740761
if [ "$ELAPSED" -ge "$LIMIT" ]; then
741762
echo "::error::Requester pod did not appear within ${LIMIT}s"
742-
kubectl get pods -n "$FMA_NAMESPACE" -o wide
743-
kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
744763
exit 1
745764
fi
746765
sleep 5
@@ -750,22 +769,21 @@ jobs:
750769
REQUESTER=$(kubectl get pods -n "$FMA_NAMESPACE" -l "app=dp-example,instance=$INST" -o json | jq -r '.items[0].metadata.name')
751770
echo "Requester pod: $REQUESTER"
752771
753-
echo "Waiting for launcher pod..."
772+
# LauncherPopulationPolicy specifies launcherCount per node with nvidia.com/gpu.present=true
773+
GPU_NODES=$(kubectl get nodes -l nvidia.com/gpu.present=true --field-selector spec.unschedulable!=true -o name | wc -l | tr -d ' ')
774+
echo "Expecting launcher-populator to create $GPU_NODES launcher(s)"
775+
776+
echo "Waiting for launcher-populator to create launcher pods..."
754777
ELAPSED=0
755778
while true; do
756779
COUNT=$(kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o json 2>/dev/null | jq '.items | length')
757-
if [ "$COUNT" -ge 1 ]; then
758-
echo "Launcher pod(s) found: $COUNT"
780+
if [ "$COUNT" -ge "$GPU_NODES" ]; then
781+
echo "Launcher-populator created $COUNT launcher(s) successfully"
782+
kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o wide
759783
break
760784
fi
761785
if [ "$ELAPSED" -ge "$LIMIT" ]; then
762-
echo "::error::Launcher pod did not appear within ${LIMIT}s"
763-
kubectl get pods -n "$FMA_NAMESPACE" -o wide
764-
kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
765-
echo "=== Controller logs ==="
766-
kubectl logs deployment/"$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE" --tail=50 || true
767-
echo "=== Requester logs ==="
768-
kubectl logs "$REQUESTER" -n "$FMA_NAMESPACE" --tail=50 || true
786+
echo "::error::Launcher-populator did not create expected $GPU_NODES launcher(s) within ${LIMIT}s (found: $COUNT)"
769787
exit 1
770788
fi
771789
sleep 5
@@ -797,9 +815,6 @@ jobs:
797815
fi
798816
if [ "$ELAPSED" -ge "$LIMIT" ]; then
799817
echo "::error::Launcher-to-requester binding not established within ${LIMIT}s"
800-
kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
801-
echo "=== Controller logs ==="
802-
kubectl logs deployment/"$FMA_RELEASE_NAME" -n "$FMA_NAMESPACE" --tail=100 || true
803818
exit 1
804819
fi
805820
sleep 5
@@ -818,9 +833,6 @@ jobs:
818833
if [ "$ELAPSED" -ge "$LIMIT" ]; then
819834
echo "::error::Requester-to-launcher binding not established within ${LIMIT}s"
820835
echo " Requester dual label: '$REQUESTER_DUAL' (expected: '$LAUNCHER')"
821-
kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
822-
echo "=== Controller logs ==="
823-
kubectl logs deployment/"$FMA_RELEASE_NAME" -n "$FMA_NAMESPACE" --tail=100 || true
824836
exit 1
825837
fi
826838
sleep 5
@@ -863,66 +875,19 @@ jobs:
863875
864876
- name: Dump vLLM instance logs from launchers
865877
if: always()
866-
run: |
867-
echo "Fetching vLLM instance logs from launcher pods..."
868-
869-
# Get all launcher pods (there may be more than one)
870-
LAUNCHER_PODS=$(kubectl get pods -n "$FMA_NAMESPACE" \
871-
-l "dual-pods.llm-d.ai/launcher-config-name" \
872-
-o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
873-
874-
if [ -z "$LAUNCHER_PODS" ]; then
875-
echo "No launcher pods found"
876-
exit 0
877-
fi
878-
879-
# Process each launcher pod
880-
for LAUNCHER_POD in $LAUNCHER_PODS; do
881-
echo ""
882-
echo "=========================================="
883-
echo "=== Launcher pod: $LAUNCHER_POD ==="
884-
echo "=========================================="
885-
886-
# Use port-forward to access launcher API from runner (avoids requiring curl in container)
887-
kubectl port-forward -n "$FMA_NAMESPACE" "pod/$LAUNCHER_POD" 18001:8001 &
888-
PF_PID=$!
889-
sleep 2
890-
891-
# Get list of vLLM instances from launcher API
892-
echo ""
893-
echo "=== vLLM instances status ==="
894-
INSTANCES_JSON=$(curl -s "http://localhost:18001/v2/vllm/instances" 2>/dev/null || true)
895-
echo "$INSTANCES_JSON" | jq . 2>/dev/null || echo "$INSTANCES_JSON"
896-
897-
# Get instance IDs using jq
898-
INSTANCE_IDS=$(echo "$INSTANCES_JSON" | jq -r '.instances[].instance_id // empty' 2>/dev/null || true)
899-
900-
if [ -z "$INSTANCE_IDS" ]; then
901-
echo "No vLLM instances found on this launcher"
902-
else
903-
# Fetch logs for each instance
904-
for id in $INSTANCE_IDS; do
905-
echo ""
906-
echo "=== vLLM instance $id logs ==="
907-
curl -s "http://localhost:18001/v2/vllm/instances/$id/log" 2>/dev/null || true
908-
echo ""
909-
done
910-
fi
911-
912-
# Clean up port-forward
913-
kill $PF_PID 2>/dev/null || true
914-
wait $PF_PID 2>/dev/null || true
915-
done
878+
run: scripts/dump-launcher-vllm-logs.sh "$FMA_NAMESPACE"
916879

917880
- name: Clean up test objects
918881
if: always()
919882
env:
920883
ISC: ${{ steps.test-objects.outputs.isc }}
921884
LC: ${{ steps.test-objects.outputs.lc }}
885+
LPP: ${{ steps.test-objects.outputs.lpp }}
922886
RS: ${{ steps.test-objects.outputs.rs }}
923887
run: |
924888
echo "Cleaning up test objects..."
925889
kubectl delete rs "$RS" -n "$FMA_NAMESPACE" --ignore-not-found || true
890+
kubectl delete launcherpopulationpolicy "$LPP" -n "$FMA_NAMESPACE" --ignore-not-found || true
926891
kubectl delete inferenceserverconfig "$ISC" -n "$FMA_NAMESPACE" --ignore-not-found || true
927892
kubectl delete launcherconfig "$LC" -n "$FMA_NAMESPACE" --ignore-not-found || true
928893
# Wait for test pods to terminate

.github/workflows/launcher-based-e2e-test.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,7 @@ jobs:
103103
echo "=== Logs for $pod ==="
104104
kubectl logs $pod || echo "Failed to get logs for $pod"
105105
done
106+
107+
- name: show vLLM instance logs from launchers
108+
if: always()
109+
run: scripts/dump-launcher-vllm-logs.sh

charts/fma-controllers/templates/launcher-populator/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ spec:
2727
image: "{{ .Values.global.imageRegistry }}/launcher-populator:{{ .Values.global.imageTag }}"
2828
imagePullPolicy: {{ .Values.launcherPopulator.pullPolicy | default "IfNotPresent" }}
2929
command:
30-
- /launcher-populator
30+
- /ko-app/launcher-populator
3131
- --namespace={{ .Release.Namespace }}
3232
ports:
3333
- containerPort: 8080

scripts/dump-launcher-vllm-logs.sh

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env bash
# Dump vLLM instance logs from all launcher pods.
#
# Queries each launcher's HTTP API (pod port 8001) via kubectl port-forward,
# lists its vLLM instances, and prints each instance's log to stdout.
#
# Usage: dump-launcher-vllm-logs.sh [namespace]
#   namespace: Kubernetes namespace (defaults to kubectl current context)

set -euo pipefail

NS_FLAG=()
if [ -n "${1:-}" ]; then
  NS_FLAG=(-n "$1")
fi

# Local port used for every port-forward to a launcher's API port 8001.
readonly LOCAL_PORT=18001

PF_PID=""
# Under 'set -e' an unexpected failure mid-loop would otherwise leak the
# background kubectl port-forward; the EXIT trap guarantees it is reaped.
cleanup() {
  if [ -n "$PF_PID" ]; then
    kill "$PF_PID" 2>/dev/null || true
    wait "$PF_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT

echo "Fetching vLLM instance logs from launcher pods..."

LAUNCHER_PODS=$(kubectl get pods "${NS_FLAG[@]}" \
  -l "dual-pods.llm-d.ai/launcher-config-name" \
  -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)

if [ -z "$LAUNCHER_PODS" ]; then
  echo "No launcher pods found"
  exit 0
fi

for LAUNCHER_POD in $LAUNCHER_PODS; do
  echo "=========================================="
  echo "=== Launcher pod: $LAUNCHER_POD ==="
  echo "=========================================="

  kubectl port-forward "${NS_FLAG[@]}" "pod/$LAUNCHER_POD" "$LOCAL_PORT:8001" &
  PF_PID=$!
  sleep 2

  # If the port-forward already died (pod terminating, port in use),
  # skip this pod instead of probing a dead tunnel with curl.
  if ! kill -0 "$PF_PID" 2>/dev/null; then
    echo "Port-forward to $LAUNCHER_POD failed; skipping"
    PF_PID=""
    continue
  fi

  # Get list of vLLM instances from the launcher API.
  echo ""
  echo "=== vLLM instances status ==="
  INSTANCES_JSON=$(curl -s "http://localhost:$LOCAL_PORT/v2/vllm/instances" || true)
  # Pretty-print if valid JSON; otherwise dump the raw response for debugging.
  echo "$INSTANCES_JSON" | jq . 2>/dev/null || echo "$INSTANCES_JSON"

  # Extract instance IDs (empty if the response was missing or not JSON).
  INSTANCE_IDS=$(echo "$INSTANCES_JSON" | jq -r '.instances[].instance_id // empty' 2>/dev/null || true)

  if [ -z "$INSTANCE_IDS" ]; then
    echo "No vLLM instances found on launcher: $LAUNCHER_POD"
  else
    # Fetch logs for each instance
    for id in $INSTANCE_IDS; do
      echo ""
      echo "=== vLLM instance $id log ==="
      curl -s "http://localhost:$LOCAL_PORT/v2/vllm/instances/$id/log" || true
      echo ""
    done
  fi

  # Tear down this pod's port-forward before moving on to the next one.
  kill "$PF_PID" 2>/dev/null || true
  wait "$PF_PID" 2>/dev/null || true
  PF_PID=""
done

test/e2e/mkobjs.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,21 @@ spec:
9797
valueFrom:
9898
fieldRef: { fieldPath: metadata.namespace }
9999
---
100+
apiVersion: fma.llm-d.ai/v1alpha1
101+
kind: LauncherPopulationPolicy
102+
metadata:
103+
name: lpp-$inst
104+
labels:
105+
instance: "$inst"
106+
spec:
107+
enhancedNodeSelector:
108+
labelSelector:
109+
matchLabels:
110+
nvidia.com/gpu.present: "true"
111+
countForLauncher:
112+
- launcherConfigName: launcher-config-$inst
113+
launcherCount: 1
114+
---
100115
apiVersion: apps/v1
101116
kind: ReplicaSet
102117
metadata:
@@ -163,6 +178,7 @@ then
163178
echo my-request-$inst
164179
echo inference-server-config-qwen-$inst
165180
echo inference-server-config-tinyllama-$inst
181+
echo lpp-$inst
166182
else
167183
echo Failed to create objects >&2
168184
echo "$out" >&2

0 commit comments

Comments (0)