@@ -529,21 +529,28 @@ jobs:
529529 --set dualPodsController.sleeperLimit=2 \
530530 --set global.local=false \
531531 --set dualPodsController.debugAcceleratorMemory=false \
532- --set launcherPopulator.enabled=false
532+ --set launcherPopulator.enabled=true
533533
534- - name : Wait for controller to be ready
534+ - name : Wait for FMA controllers to be ready
535535 run : |
536- echo "Waiting for FMA controller deployment to be ready..."
537536 kubectl wait --for=condition=available --timeout=120s \
538537 deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
539-
540538 echo ""
541- echo "=== Controller Pod Status ==="
542- kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/name=fma-controllers
539+ echo "=== Dual-Pod Controller Pod Status ==="
540+ kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=dual-pods-controller
543541 echo ""
544- echo "=== Controller Deployment ==="
542+ echo "=== Dual-Pod Controller Deployment ==="
545543 kubectl get deployment "$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE"
546544
545+ kubectl wait --for=condition=available --timeout=120s \
546+ deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"
547+ echo ""
548+ echo "=== Launcher Populator Pod Status ==="
549+ kubectl get pods -n "$FMA_NAMESPACE" -l app.kubernetes.io/component=launcher-populator
550+ echo ""
551+ echo "=== Launcher Populator Deployment ==="
552+ kubectl get deployment "$FMA_RELEASE_NAME-launcher-populator" -n "$FMA_NAMESPACE"
553+
547554 - name : Verify controller health
548555 run : |
549556 echo "Checking controller pod for issues..."
@@ -671,6 +678,19 @@ jobs:
671678 - name: XDG_CONFIG_HOME
672679 value: "/tmp"
673680 ---
681+ apiVersion: fma.llm-d.ai/v1alpha1
682+ kind: LauncherPopulationPolicy
683+ metadata:
684+ name: lpp-${INST}
685+ spec:
686+ enhancedNodeSelector:
687+ labelSelector:
688+ matchLabels:
689+ nvidia.com/gpu.present: "true"
690+ countForLauncher:
691+ - launcherConfigName: launcher-config-${INST}
692+ launcherCount: 1
693+ ---
674694 apiVersion: apps/v1
675695 kind: ReplicaSet
676696 metadata:
@@ -720,6 +740,7 @@ jobs:
720740 echo "instance=${INST}" >> $GITHUB_OUTPUT
721741 echo "isc=inference-server-config-${INST}" >> $GITHUB_OUTPUT
722742 echo "lc=launcher-config-${INST}" >> $GITHUB_OUTPUT
743+ echo "lpp=lpp-${INST}" >> $GITHUB_OUTPUT
723744 echo "rs=my-request-${INST}" >> $GITHUB_OUTPUT
724745 echo "Test objects created"
725746
@@ -739,8 +760,6 @@ jobs:
739760 fi
740761 if [ "$ELAPSED" -ge "$LIMIT" ]; then
741762 echo "::error::Requester pod did not appear within ${LIMIT}s"
742- kubectl get pods -n "$FMA_NAMESPACE" -o wide
743- kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
744763 exit 1
745764 fi
746765 sleep 5
@@ -750,22 +769,21 @@ jobs:
750769 REQUESTER=$(kubectl get pods -n "$FMA_NAMESPACE" -l "app=dp-example,instance=$INST" -o json | jq -r '.items[0].metadata.name')
751770 echo "Requester pod: $REQUESTER"
752771
753- echo "Waiting for launcher pod..."
772+ # LauncherPopulationPolicy specifies launcherCount per node with nvidia.com/gpu.present=true
773+ GPU_NODES=$(kubectl get nodes -l nvidia.com/gpu.present=true --field-selector spec.unschedulable!=true -o name | wc -l | tr -d ' ')
774+ echo "Expecting launcher-populator to create $GPU_NODES launcher(s)"
775+
776+ echo "Waiting for launcher-populator to create launcher pods..."
754777 ELAPSED=0
755778 while true; do
756779 COUNT=$(kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o json 2>/dev/null | jq '.items | length')
757- if [ "$COUNT" -ge 1 ]; then
758- echo "Launcher pod(s) found: $COUNT"
780+ if [ "$COUNT" -ge "$GPU_NODES" ]; then
781+ echo "Launcher-populator created $COUNT launcher(s) successfully"
782+ kubectl get pods -n "$FMA_NAMESPACE" -l "dual-pods.llm-d.ai/launcher-config-name=$LC" -o wide
759783 break
760784 fi
761785 if [ "$ELAPSED" -ge "$LIMIT" ]; then
762- echo "::error::Launcher pod did not appear within ${LIMIT}s"
763- kubectl get pods -n "$FMA_NAMESPACE" -o wide
764- kubectl get events -n "$FMA_NAMESPACE" --sort-by='.lastTimestamp' | tail -20
765- echo "=== Controller logs ==="
766- kubectl logs deployment/"$FMA_RELEASE_NAME-dual-pods-controller" -n "$FMA_NAMESPACE" --tail=50 || true
767- echo "=== Requester logs ==="
768- kubectl logs "$REQUESTER" -n "$FMA_NAMESPACE" --tail=50 || true
786+ echo "::error::Launcher-populator did not create expected $GPU_NODES launcher(s) within ${LIMIT}s (found: $COUNT)"
769787 exit 1
770788 fi
771789 sleep 5
@@ -797,9 +815,6 @@ jobs:
797815 fi
798816 if [ "$ELAPSED" -ge "$LIMIT" ]; then
799817 echo "::error::Launcher-to-requester binding not established within ${LIMIT}s"
800- kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
801- echo "=== Controller logs ==="
802- kubectl logs deployment/"$FMA_RELEASE_NAME" -n "$FMA_NAMESPACE" --tail=100 || true
803818 exit 1
804819 fi
805820 sleep 5
@@ -818,9 +833,6 @@ jobs:
818833 if [ "$ELAPSED" -ge "$LIMIT" ]; then
819834 echo "::error::Requester-to-launcher binding not established within ${LIMIT}s"
820835 echo " Requester dual label: '$REQUESTER_DUAL' (expected: '$LAUNCHER')"
821- kubectl get pods -n "$FMA_NAMESPACE" -o wide --show-labels
822- echo "=== Controller logs ==="
823- kubectl logs deployment/"$FMA_RELEASE_NAME" -n "$FMA_NAMESPACE" --tail=100 || true
824836 exit 1
825837 fi
826838 sleep 5
@@ -863,66 +875,19 @@ jobs:
863875
864876 - name : Dump vLLM instance logs from launchers
865877 if : always()
866- run : |
867- echo "Fetching vLLM instance logs from launcher pods..."
868-
869- # Get all launcher pods (there may be more than one)
870- LAUNCHER_PODS=$(kubectl get pods -n "$FMA_NAMESPACE" \
871- -l "dual-pods.llm-d.ai/launcher-config-name" \
872- -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
873-
874- if [ -z "$LAUNCHER_PODS" ]; then
875- echo "No launcher pods found"
876- exit 0
877- fi
878-
879- # Process each launcher pod
880- for LAUNCHER_POD in $LAUNCHER_PODS; do
881- echo ""
882- echo "=========================================="
883- echo "=== Launcher pod: $LAUNCHER_POD ==="
884- echo "=========================================="
885-
886- # Use port-forward to access launcher API from runner (avoids requiring curl in container)
887- kubectl port-forward -n "$FMA_NAMESPACE" "pod/$LAUNCHER_POD" 18001:8001 &
888- PF_PID=$!
889- sleep 2
890-
891- # Get list of vLLM instances from launcher API
892- echo ""
893- echo "=== vLLM instances status ==="
894- INSTANCES_JSON=$(curl -s "http://localhost:18001/v2/vllm/instances" 2>/dev/null || true)
895- echo "$INSTANCES_JSON" | jq . 2>/dev/null || echo "$INSTANCES_JSON"
896-
897- # Get instance IDs using jq
898- INSTANCE_IDS=$(echo "$INSTANCES_JSON" | jq -r '.instances[].instance_id // empty' 2>/dev/null || true)
899-
900- if [ -z "$INSTANCE_IDS" ]; then
901- echo "No vLLM instances found on this launcher"
902- else
903- # Fetch logs for each instance
904- for id in $INSTANCE_IDS; do
905- echo ""
906- echo "=== vLLM instance $id logs ==="
907- curl -s "http://localhost:18001/v2/vllm/instances/$id/log" 2>/dev/null || true
908- echo ""
909- done
910- fi
911-
912- # Clean up port-forward
913- kill $PF_PID 2>/dev/null || true
914- wait $PF_PID 2>/dev/null || true
915- done
878+ run : scripts/dump-launcher-vllm-logs.sh "$FMA_NAMESPACE"
916879
917880 - name : Clean up test objects
918881 if : always()
919882 env :
920883 ISC : ${{ steps.test-objects.outputs.isc }}
921884 LC : ${{ steps.test-objects.outputs.lc }}
885+ LPP : ${{ steps.test-objects.outputs.lpp }}
922886 RS : ${{ steps.test-objects.outputs.rs }}
923887 run : |
924888 echo "Cleaning up test objects..."
925889 kubectl delete rs "$RS" -n "$FMA_NAMESPACE" --ignore-not-found || true
890+ kubectl delete launcherpopulationpolicy "$LPP" -n "$FMA_NAMESPACE" --ignore-not-found || true
926891 kubectl delete inferenceserverconfig "$ISC" -n "$FMA_NAMESPACE" --ignore-not-found || true
927892 kubectl delete launcherconfig "$LC" -n "$FMA_NAMESPACE" --ignore-not-found || true
928893 # Wait for test pods to terminate