Skip to content

Commit d2d43ca

Browse files
authored
Integration between the dual-pods controller and the launcher (llm-d-incubation#231)
* Begin to manage the lifecycles of vLLM instance via launchers
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Rework the process of infSvrItem
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Move some controllers-shared stuff to pkg/controller/common/
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Revise bind and unbind logic to be compatible with launcher-based providers
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Correct the hashing of LauncherConfig
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Use Spec.ModelServerConfig.Port instead of hard-coded port for vLLM instances
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Improve error reports
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Take all matching launcher Pods into consideration
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Better handle the inference server port of launcher-hosted vLLM instances
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Initial e2e tests for CR-based requesters and launcher-based providers
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Use GitHub Actions for launcher-based e2e tests
* Lengthen timeout for launcher-based e2e tests
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Update the helm command for dpctlr due to its new dir
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
* Enforce predictable output format for "docker images"; fix typo
  Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
---------
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
1 parent bdaaea7 commit d2d43ca

File tree

9 files changed

+861
-57
lines changed

9 files changed

+861
-57
lines changed
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Tests launcher-based server-providing pods in a `kind` cluster
2+
name: Launcher-Based E2E Test
3+
4+
on:
5+
push:
6+
paths:
7+
- ".github/workflows/launcher-based-e2e-test.yml"
8+
- Makefile
9+
- cmd/dual-pods-controller/**
10+
- cmd/test-requester/**
11+
- cmd/launcher-populator/**
12+
- inference_server/launcher/**
13+
- dockerfiles/Dockerfile.launcher.benchmark
14+
- pkg/**
15+
- test/e2e/run-launcher-based.sh
16+
- test/e2e/mkobjs.sh
17+
pull_request:
18+
branches:
19+
- main
20+
21+
jobs:
22+
debug:
23+
name: print relevant info
24+
runs-on: ubuntu-latest
25+
steps:
26+
- run: |
27+
echo "github.actor=${{ github.actor }}"
28+
echo "github.action_ref=${{ github.action_ref }}"
29+
echo "github.event_name=${{ github.event_name }}"
30+
echo "github.head_ref=$GITHUB_HEAD_REF"
31+
echo "github.ref=${{ github.ref }}"
32+
echo "github.ref_name=${{ github.ref_name }}"
33+
echo "github.repository=${{ github.repository }}"
34+
echo "github.repository_owner=${{ github.repository_owner }}"
35+
echo "github.triggering_actor=${{ github.triggering_actor }}"
36+
echo "GITHUB_ACTION_REF=$GITHUB_ACTION_REF"
37+
38+
run-launcher-test:
39+
runs-on: ubuntu-22.04-arm
40+
steps:
41+
- uses: actions/setup-go@v5
42+
with:
43+
go-version: '1.24.2'
44+
45+
- name: Install ko
46+
uses: ko-build/setup-ko@v0.8
47+
with:
48+
version: v0.15.2
49+
50+
- name: Checkout code
51+
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
52+
53+
- name: Set up Docker Buildx
54+
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
55+
56+
- name: Run launcher-based E2E test
57+
run: test/e2e/run-launcher-based.sh
58+
59+
- name: show all pods
60+
if: always()
61+
run: kubectl get pods -A -o wide
62+
63+
- name: show test pods with labels
64+
if: always()
65+
run: kubectl get pods -L dual-pods.llm-d.ai/dual,dual-pods.llm-d.ai/sleeping,dual-pods.llm-d.ai/launcher-config-name
66+
67+
- name: show ReplicaSets
68+
if: always()
69+
run: kubectl get rs -A
70+
71+
- name: show dual-pods controller log
72+
if: always()
73+
run: kubectl logs deploy/dpctlr
74+
75+
- name: show launcher-populator log
76+
if: always()
77+
run: kubectl logs deploy/launcher-populator || echo "launcher-populator not deployed"
78+
79+
- name: show GPU allocations
80+
if: always()
81+
run: kubectl get cm gpu-allocs -o yaml
82+
83+
- name: show GPU map
84+
if: always()
85+
run: kubectl get cm gpu-map -o yaml
86+
87+
- name: show InferenceServerConfigs
88+
if: always()
89+
run: kubectl get inferenceserverconfigs -o yaml
90+
91+
- name: show LauncherConfigs
92+
if: always()
93+
run: kubectl get launcherconfigs -o yaml
94+
95+
- name: show YAML of test pods
96+
if: always()
97+
run: kubectl get pods -o yaml
98+
99+
- name: show launcher pod logs
  # Diagnostic step: runs even when earlier steps failed.
  if: always()
  run: |
    # Dump the logs of every Pod that carries the launcher-config-name label.
    # "-o name" yields one "pod/<name>" token per Pod.
    for pod in $(kubectl get pods -l dual-pods.llm-d.ai/launcher-config-name -o name); do
      echo "=== Logs for $pod ==="
      # Quote the expansion so the Pod reference is always passed as a single argument;
      # a failure for one Pod is reported without aborting the loop.
      kubectl logs "$pod" || echo "Failed to get logs for $pod"
    done

pkg/controller/common/interface.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,9 @@ const (
3434
// that is relevant to identify the launcher-based server-providing Pod, mainly the
3535
// corresponding LauncherConfig object's PodTemplate that the server-providing Pod uses.
3636
LauncherConfigHashAnnotationKey = "dual-pods.llm-d.ai/launcher-config-hash"
37+
38+
// LauncherServicePort is the port number on which the launcher exposes its HTTP service
39+
// for the management of vLLM instances.
40+
// This is a contract between the controllers and the launcher implementation.
41+
LauncherServicePort = 8001
3742
)

pkg/controller/dual-pods/controller.go

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242

4343
fmav1alpha1 "github.com/llm-d-incubation/llm-d-fast-model-actuation/api/fma/v1alpha1"
4444
"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/api"
45+
ctlrcommon "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/common"
4546
genctlr "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/generic"
4647
"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/utils"
4748
fmainformers "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/generated/informers/externalversions"
@@ -103,7 +104,7 @@ func GPUIndexFunc(obj any) ([]string, error) {
103104
if len(pod.Annotations[nominalHashAnnotationKey]) == 0 || pod.Spec.NodeName == "" {
104105
return []string{}, nil
105106
}
106-
isIdx, _, err := utils.GetInferenceServerPort(pod)
107+
isIdx, _, err := utils.GetInferenceServerPort(pod, false)
107108
if err != nil {
108109
return []string{}, nil
109110
}
@@ -175,6 +176,7 @@ func (config ControllerConfig) NewController(
175176
ctl.gpuMap.Store(&map[string]GpuLocation{})
176177
err := ctl.podInformer.AddIndexers(cache.Indexers{
177178
inferenceServerConfigIndexName: inferenceServerConfigIndexFunc,
179+
launcherConfigHashIndexName: launcherConfigHashIndexFunc,
178180
requesterIndexName: requesterIndexFunc,
179181
nominalHashIndexName: nominalHashIndexFunc,
180182
GPUIndexName: GPUIndexFunc})
@@ -303,7 +305,6 @@ type serverData struct {
303305
RequesterDeleteRequested bool
304306
}
305307

306-
// nolint
307308
type launcherData struct {
308309
// Instances is a map,
309310
// where key is an instance's ID which is the instance's nominal hash,
@@ -313,7 +314,6 @@ type launcherData struct {
313314
// Accurate indicates whether the set of nominal hash in Instances is accurate.
314315
Accurate bool
315316
}
316-
317317
type queueItem interface {
318318
// process returns (err error, retry bool).
319319
// There will be a retry iff `retry`, error logged if `err != nil`.
@@ -371,6 +371,17 @@ func inferenceServerConfigIndexFunc(obj any) ([]string, error) {
371371
return []string{inferenceServerConfigName}, nil
372372
}
373373

374+
// launcherConfigHashIndexName is the name under which launcherConfigHashIndexFunc
// is registered as an indexer on the Pod informer.
const launcherConfigHashIndexName = "launcherconfighash"
375+
376+
func launcherConfigHashIndexFunc(obj any) ([]string, error) {
377+
pod := obj.(*corev1.Pod)
378+
launcherConfigHash := pod.Annotations[ctlrcommon.LauncherConfigHashAnnotationKey]
379+
if len(launcherConfigHash) == 0 {
380+
return []string{}, nil
381+
}
382+
return []string{launcherConfigHash}, nil
383+
}
384+
374385
const requesterIndexName = "requester"
375386

376387
func requesterIndexFunc(obj any) ([]string, error) {
@@ -627,6 +638,7 @@ func (ctl *controller) getNodeData(nodeName string) *nodeData {
627638
ans = &nodeData{
628639
Items: sets.New[itemOnNode](),
629640
InferenceServers: make(map[apitypes.UID]*serverData),
641+
Launchers: make(map[string]*launcherData),
630642
}
631643
ctl.nodeNameToData[nodeName] = ans
632644
}
@@ -660,6 +672,19 @@ func (ctl *controller) getServerData(nodeDat *nodeData, reqName string, reqUID a
660672
return ans
661673
}
662674

675+
func (ctl *controller) getLauncherData(nodeDat *nodeData, launcherPodName string) *launcherData {
676+
ctl.mutex.Lock()
677+
defer ctl.mutex.Unlock()
678+
ans := nodeDat.Launchers[launcherPodName]
679+
if ans == nil {
680+
ans = &launcherData{
681+
Instances: make(map[string]time.Time),
682+
}
683+
nodeDat.Launchers[launcherPodName] = ans
684+
}
685+
return ans
686+
}
687+
663688
func (ctl *controller) clearServerData(nodeDat *nodeData, uid apitypes.UID) {
664689
ctl.mutex.Lock()
665690
defer ctl.mutex.Unlock()

0 commit comments

Comments
 (0)