Skip to content

Commit 90dcc59

Browse files
authored
feat: Add vLLM Data Parallel support to llm-d-inference-scheduler (#392)
* Added definition of header for Data Parallel support
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Added plugin for Data Parallel support without P/D
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Refactored the sidecar and added Data Parallel support
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Updates to sidecar tests due to refactoring
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* For local kind tests load locally built sidecar
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Added Data Parallel support to kind based tests
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Re-build all local images when testing under kind
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Review changes
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* lint fixes
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Added the sidecar to the non-pd deployment
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Removed debug logging
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Don't use PD deployment, just because of DP
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* renamed plugin parameter as per review
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Removed command line argument with default value
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Removed test of removed deprecated code
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Removed definition of removed deprecated header
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* image-build now builds the side-car image as well
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* Updates to tests
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
* image-build now builds the side-car image as well
  Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>

---------

Signed-off-by: Shmuel Kallner <kallner@il.ibm.com>
1 parent 221e500 commit 90dcc59

File tree

25 files changed

+670
-264
lines changed

25 files changed

+670
-264
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ test-integration: download-tokenizer install-dependencies ## Run integration tes
112112
go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/
113113

114114
.PHONY: test-e2e
115-
test-e2e: image-build sidecar-image-build ## Run end-to-end tests against a new kind cluster
115+
test-e2e: image-build ## Run end-to-end tests against a new kind cluster
116116
@printf "\033[33;1m==== Running End to End Tests ====\033[0m\n"
117117
./test/scripts/run_e2e.sh
118118

cmd/pd-sidecar/main.go

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License.
1616
package main
1717

1818
import (
19+
"crypto/tls"
1920
"flag"
2021
"net/url"
2122
"os"
@@ -30,6 +31,7 @@ import (
3031
func main() {
3132
port := flag.String("port", "8000", "the port the sidecar is listening on")
3233
vLLMPort := flag.String("vllm-port", "8001", "the port vLLM is listening on")
34+
vLLMDataParallelSize := flag.Int("data-parallel-size", 1, "the vLLM DATA-PARALLEL-SIZE value")
3335
connector := flag.String("connector", "nixlv2", "the P/D connector being used. Either nixl, nixlv2 or lmcache")
3436
prefillerUseTLS := flag.Bool("prefiller-use-tls", false, "whether to use TLS when sending requests to prefillers")
3537
decoderUseTLS := flag.Bool("decoder-use-tls", false, "whether to use TLS when sending requests to the decoder")
@@ -86,23 +88,39 @@ func main() {
8688
return
8789
}
8890

91+
var cert *tls.Certificate
92+
if *secureProxy {
93+
var tempCert tls.Certificate
94+
if *certPath != "" {
95+
tempCert, err = tls.LoadX509KeyPair(*certPath+"/tls.crt", *certPath+"/tls.key")
96+
} else {
97+
tempCert, err = proxy.CreateSelfSignedTLSCertificate()
98+
}
99+
if err != nil {
100+
logger.Error(err, "failed to create TLS certificate")
101+
return
102+
}
103+
cert = &tempCert
104+
}
105+
89106
config := proxy.Config{
90107
Connector: *connector,
91108
PrefillerUseTLS: *prefillerUseTLS,
92-
SecureProxy: *secureProxy,
93-
CertPath: *certPath,
94109
PrefillerInsecureSkipVerify: *prefillerInsecureSkipVerify,
95110
DecoderInsecureSkipVerify: *decoderInsecureSkipVerify,
96-
EnableSSRFProtection: *enableSSRFProtection,
97-
InferencePoolNamespace: *inferencePoolNamespace,
98-
InferencePoolName: *inferencePoolName,
111+
DataParallelSize: *vLLMDataParallelSize,
99112
}
100113

101-
proxy, err := proxy.NewProxy(*port, targetURL, config)
114+
// Create SSRF protection validator
115+
validator, err := proxy.NewAllowlistValidator(*enableSSRFProtection, *inferencePoolNamespace, *inferencePoolName)
102116
if err != nil {
103-
logger.Error(err, "Failed to create proxy")
117+
logger.Error(err, "failed to create SSRF protection validator")
118+
return
104119
}
105-
if err := proxy.Start(ctx); err != nil {
120+
121+
proxyServer := proxy.NewProxy(*port, targetURL, config)
122+
123+
if err := proxyServer.Start(ctx, cert, validator); err != nil {
106124
logger.Error(err, "failed to start proxy server")
107125
}
108126
}

deploy/components/crds-gie/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
1010
kind: Kustomization
1111

1212
resources:
13-
- https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=v1.0.0
13+
- https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=v1.1.0-rc.1

deploy/components/inference-gateway/deployments.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ spec:
2323
args:
2424
- --pool-name
2525
- "${POOL_NAME}"
26-
- "--pool-group"
27-
- "inference.networking.x-k8s.io"
2826
- --v
2927
- "4"
3028
- --zap-encoder

deploy/components/inference-gateway/httproutes.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ spec:
1111
type: PathPrefix
1212
value: /
1313
backendRefs:
14-
- group: inference.networking.x-k8s.io
14+
- group: inference.networking.k8s.io
1515
kind: InferencePool
1616
name: ${POOL_NAME}
1717
port: 8000
Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1-
apiVersion: inference.networking.x-k8s.io/v1alpha2
1+
apiVersion: inference.networking.k8s.io/v1
22
kind: InferencePool
33
metadata:
44
name: ${POOL_NAME}
55
spec:
6-
targetPortNumber: 8000
76
selector:
8-
app: ${POOL_NAME}
9-
extensionRef:
7+
matchLabels:
8+
app: ${POOL_NAME}
9+
endpointPickerRef:
1010
name: ${EPP_NAME}
11+
kind: Service
12+
port:
13+
number: 9002
14+
targetPorts:
15+
- number: ${TARGET_PORTS}

deploy/components/inference-gateway/rbac.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ rules:
1212
- "get"
1313
- "watch"
1414
- "list"
15+
- apiGroups:
16+
- "inference.networking.k8s.io"
17+
resources:
18+
- "inferencepools"
19+
verbs:
20+
- "get"
21+
- "watch"
22+
- "list"
1523
- apiGroups:
1624
- ""
1725
resources:

deploy/components/istio-control-plane/rbac.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,21 @@ rules:
330330
verbs:
331331
- update
332332
- patch
333+
- apiGroups:
334+
- "inference.networking.k8s.io"
335+
resources:
336+
- "inferencepools"
337+
verbs:
338+
- "get"
339+
- "watch"
340+
- "list"
341+
- apiGroups:
342+
- "inference.networking.k8s.io"
343+
resources:
344+
- inferencepools/status
345+
verbs:
346+
- update
347+
- patch
333348
- apiGroups:
334349
- ""
335350
resources:

deploy/components/vllm-sim-pd/deployments.yaml

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,21 +55,72 @@ spec:
5555
- "--port=8000"
5656
- "--vllm-port=8200"
5757
- "--connector=lmcache"
58+
- "--secure-proxy=false"
59+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
5860
ports:
59-
- containerPort: 8000
61+
- name: sidecar-http
62+
containerPort: 8000
63+
protocol: TCP
64+
- name: sidecar-rank1
65+
containerPort: 8001
66+
protocol: TCP
67+
- name: sidecar-rank2
68+
containerPort: 8002
69+
protocol: TCP
70+
- name: sidecar-rank3
71+
containerPort: 8003
72+
protocol: TCP
73+
- name: sidecar-rank4
74+
containerPort: 8004
75+
protocol: TCP
76+
- name: sidecar-rank5
77+
containerPort: 8005
78+
protocol: TCP
79+
- name: sidecar-rank6
80+
containerPort: 8006
81+
protocol: TCP
82+
- name: sidecar-rank7
83+
containerPort: 8007
6084
protocol: TCP
6185
restartPolicy: Always
86+
env:
87+
- name: POD_IP
88+
valueFrom:
89+
fieldRef:
90+
fieldPath: status.podIP
6291
containers:
6392
- name: vllm
6493
image: ghcr.io/llm-d/llm-d-inference-sim:latest
6594
imagePullPolicy: IfNotPresent
6695
args:
6796
- "--port=8200"
6897
- "--model=${MODEL_NAME}"
98+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
6999
ports:
70100
- name: http
71101
containerPort: 8200
72102
protocol: TCP
103+
- name: rank1
104+
containerPort: 8201
105+
protocol: TCP
106+
- name: rank2
107+
containerPort: 8202
108+
protocol: TCP
109+
- name: rank3
110+
containerPort: 8203
111+
protocol: TCP
112+
- name: rank4
113+
containerPort: 8204
114+
protocol: TCP
115+
- name: rank5
116+
containerPort: 8205
117+
protocol: TCP
118+
- name: rank6
119+
containerPort: 8206
120+
protocol: TCP
121+
- name: rank7
122+
containerPort: 8207
123+
protocol: TCP
73124
env:
74125
- name: PORT
75126
value: "8200"

deploy/components/vllm-sim/deployments.yaml

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,26 +14,89 @@ spec:
1414
labels:
1515
app: ${POOL_NAME}
1616
spec:
17+
initContainers:
18+
- name: routing-sidecar
19+
image: ghcr.io/llm-d/llm-d-routing-sidecar:latest
20+
imagePullPolicy: IfNotPresent
21+
args:
22+
- "--port=8000"
23+
- "--vllm-port=8200"
24+
- "--connector=lmcache"
25+
- "--secure-proxy=false"
26+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
27+
ports:
28+
- name: sidecar-http
29+
containerPort: 8000
30+
protocol: TCP
31+
- name: sidecar-rank1
32+
containerPort: 8001
33+
protocol: TCP
34+
- name: sidecar-rank2
35+
containerPort: 8002
36+
protocol: TCP
37+
- name: sidecar-rank3
38+
containerPort: 8003
39+
protocol: TCP
40+
- name: sidecar-rank4
41+
containerPort: 8004
42+
protocol: TCP
43+
- name: sidecar-rank5
44+
containerPort: 8005
45+
protocol: TCP
46+
- name: sidecar-rank6
47+
containerPort: 8006
48+
protocol: TCP
49+
- name: sidecar-rank7
50+
containerPort: 8007
51+
protocol: TCP
52+
restartPolicy: Always
53+
env:
54+
- name: POD_IP
55+
valueFrom:
56+
fieldRef:
57+
fieldPath: status.podIP
1758
containers:
1859
- name: vllm
1960
image: ghcr.io/llm-d/llm-d-inference-sim:latest
2061
imagePullPolicy: IfNotPresent
2162
args:
22-
- "--port=8000"
63+
- "--port=8200"
2364
- "--model=${MODEL_NAME}"
2465
- "--enable-kvcache=${KV_CACHE_ENABLED}"
2566
- "--kv-cache-size=1024"
2667
- "--block-size=16"
2768
- "--zmq-endpoint=tcp://${EPP_NAME}.default.svc.cluster.local:5557"
2869
- "--event-batch-size=16"
2970
- "--tokenizers-cache-dir=/tokenizer-cache"
71+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
3072
ports:
3173
- name: http
32-
containerPort: 8000
74+
containerPort: 8200
75+
protocol: TCP
76+
- name: rank1
77+
containerPort: 8201
78+
protocol: TCP
79+
- name: rank2
80+
containerPort: 8202
81+
protocol: TCP
82+
- name: rank3
83+
containerPort: 8203
84+
protocol: TCP
85+
- name: rank4
86+
containerPort: 8204
87+
protocol: TCP
88+
- name: rank5
89+
containerPort: 8205
90+
protocol: TCP
91+
- name: rank6
92+
containerPort: 8206
93+
protocol: TCP
94+
- name: rank7
95+
containerPort: 8207
3396
protocol: TCP
3497
env:
3598
- name: PORT
36-
value: "8000"
99+
value: "8200"
37100
- name: PYTHONHASHSEED
38101
value: "42"
39102
volumeMounts:

0 commit comments

Comments
 (0)