Skip to content

Commit 09ad012

Browse files
committed
Initial E/PD extension for kind deployment
Signed-off-by: Revital Sur <eres@il.ibm.com>
1 parent a0c8d17 commit 09ad012

File tree

5 files changed

+333
-23
lines changed

5 files changed

+333
-23
lines changed
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
apiVersion: v1
2+
kind: PersistentVolume
3+
metadata:
4+
name: ec-cache-pv
5+
spec:
6+
capacity:
7+
storage: 5Gi
8+
accessModes:
9+
- ReadWriteMany
10+
storageClassName: manual
11+
hostPath:
12+
path: /tmp/vllm-ec-cache
13+
---
14+
apiVersion: v1
15+
kind: PersistentVolumeClaim
16+
metadata:
17+
name: ec-cache-pvc
18+
spec:
19+
storageClassName: manual
20+
accessModes:
21+
- ReadWriteMany
22+
resources:
23+
requests:
24+
storage: 5Gi
25+
---
26+
apiVersion: apps/v1
27+
kind: Deployment
28+
metadata:
29+
name: vllm-encoder
30+
labels:
31+
app: ${POOL_NAME}
32+
spec:
33+
replicas: ${VLLM_REPLICA_COUNT_E}
34+
selector:
35+
matchLabels:
36+
app: ${POOL_NAME}
37+
template:
38+
metadata:
39+
labels:
40+
app: ${POOL_NAME}
41+
llm-d.ai/role: encode
42+
spec:
43+
containers:
44+
- name: vllm
45+
image: ${VLLM_SIMULATOR_IMAGE}
46+
imagePullPolicy: IfNotPresent
47+
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
48+
args:
49+
- "--model=${MODEL_NAME}"
50+
- "--port=8000"
51+
- "--gpu-memory-utilization=0.7"
52+
- "--mm-processor-kwargs"
53+
- '{"max_pixels": 313600}'
54+
- "--enforce-eager"
55+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
56+
- "--no-enable-prefix-caching"
57+
- "--max-num-batched-tokens=114688"
58+
- "--mm-encoder-only"
59+
- "--ec-transfer-config"
60+
- '{"ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
61+
ports:
62+
- name: http
63+
containerPort: 8000
64+
protocol: TCP
65+
env:
66+
- name: PORT
67+
value: "8000"
68+
- name: HUGGINGFACE_HUB_CACHE
69+
value: "/data"
70+
- name: TRITON_CACHE_DIR
71+
value: "/.triton-cache"
72+
volumeMounts:
73+
- mountPath: /shared-ec-cache
74+
name: ec-cache
75+
- mountPath: /data
76+
name: data
77+
- mountPath: /dev/shm
78+
name: shm
79+
- name: metrics-volume
80+
mountPath: /.config
81+
- name: torch-compile-cache
82+
mountPath: /.cache
83+
- name: triton-cache
84+
mountPath: /.triton-cache
85+
restartPolicy: Always
86+
terminationGracePeriodSeconds: 30
87+
volumes:
88+
- name: ec-cache
89+
persistentVolumeClaim:
90+
claimName: ec-cache-pvc
91+
- name: data
92+
emptyDir: {}
93+
- name: shm
94+
emptyDir:
95+
medium: Memory
96+
- name: metrics-volume
97+
emptyDir: {}
98+
- name: torch-compile-cache
99+
emptyDir: {}
100+
- name: triton-cache
101+
emptyDir: {}
102+
---
103+
apiVersion: apps/v1
104+
kind: Deployment
105+
metadata:
106+
name: vllm-pd
107+
labels:
108+
app: ${POOL_NAME}
109+
spec:
110+
replicas: ${VLLM_REPLICA_COUNT_D}
111+
selector:
112+
matchLabels:
113+
app: ${POOL_NAME}
114+
template:
115+
metadata:
116+
labels:
117+
app: ${POOL_NAME}
118+
llm-d.ai/role: prefill-decode
119+
spec:
120+
initContainers:
121+
- name: routing-sidecar
122+
image: ${SIDECAR_IMAGE}
123+
imagePullPolicy: IfNotPresent
124+
args:
125+
- "--port=8000"
126+
- "--vllm-port=8200"
127+
- "--secure-proxy=false"
128+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
129+
ports:
130+
- name: sidecar-http
131+
containerPort: 8000
132+
protocol: TCP
133+
- name: sidecar-rank1
134+
containerPort: 8001
135+
protocol: TCP
136+
- name: sidecar-rank2
137+
containerPort: 8002
138+
protocol: TCP
139+
- name: sidecar-rank3
140+
containerPort: 8003
141+
protocol: TCP
142+
- name: sidecar-rank4
143+
containerPort: 8004
144+
protocol: TCP
145+
- name: sidecar-rank5
146+
containerPort: 8005
147+
protocol: TCP
148+
- name: sidecar-rank6
149+
containerPort: 8006
150+
protocol: TCP
151+
- name: sidecar-rank7
152+
containerPort: 8007
153+
protocol: TCP
154+
restartPolicy: Always
155+
env:
156+
- name: POD_IP
157+
valueFrom:
158+
fieldRef:
159+
fieldPath: status.podIP
160+
containers:
161+
- name: vllm
162+
image: ${VLLM_SIMULATOR_IMAGE}
163+
imagePullPolicy: IfNotPresent
164+
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
165+
args:
166+
- "--model=${MODEL_NAME}"
167+
- "--port=8200"
168+
- "--gpu-memory-utilization=0.7"
169+
- "--max-model-len=32768"
170+
- "--mm-processor-kwargs"
171+
- '{"max_pixels": 313600}'
172+
- "--enforce-eager"
173+
- "--max-num-seqs=1"
174+
- "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
175+
- "--ec-transfer-config"
176+
- '{"ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
177+
ports:
178+
- name: http
179+
containerPort: 8200
180+
protocol: TCP
181+
- name: rank1
182+
containerPort: 8201
183+
protocol: TCP
184+
- name: rank2
185+
containerPort: 8202
186+
protocol: TCP
187+
- name: rank3
188+
containerPort: 8203
189+
protocol: TCP
190+
- name: rank4
191+
containerPort: 8204
192+
protocol: TCP
193+
- name: rank5
194+
containerPort: 8205
195+
protocol: TCP
196+
- name: rank6
197+
containerPort: 8206
198+
protocol: TCP
199+
- name: rank7
200+
containerPort: 8207
201+
protocol: TCP
202+
env:
203+
- name: PORT
204+
value: "8200"
205+
- name: HUGGINGFACE_HUB_CACHE
206+
value: "/data"
207+
- name: TRITON_CACHE_DIR
208+
value: "/.triton-cache"
209+
volumeMounts:
210+
- mountPath: /shared-ec-cache
211+
name: ec-cache
212+
- mountPath: /data
213+
name: data
214+
- mountPath: /dev/shm
215+
name: shm
216+
- name: metrics-volume
217+
mountPath: /.config
218+
- name: torch-compile-cache
219+
mountPath: /.cache
220+
- name: triton-cache
221+
mountPath: /.triton-cache
222+
restartPolicy: Always
223+
terminationGracePeriodSeconds: 30
224+
volumes:
225+
- name: ec-cache
226+
persistentVolumeClaim:
227+
claimName: ec-cache-pvc
228+
- name: data
229+
emptyDir: {}
230+
- name: shm
231+
emptyDir:
232+
medium: Memory
233+
- name: metrics-volume
234+
emptyDir: {}
235+
- name: torch-compile-cache
236+
emptyDir: {}
237+
- name: triton-cache
238+
emptyDir: {}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# ------------------------------------------------------------------------------
2+
# VLLM Simulator
3+
#
4+
# This deploys a VLLM simulator which can be used to simulate inference for
5+
# small environments (e.g. Kubernetes In Docker (KIND) clusters), or for when
6+
# all that is needed is some basic functionality.
7+
# ------------------------------------------------------------------------------
8+
apiVersion: kustomize.config.k8s.io/v1beta1
9+
kind: Kustomization
10+
11+
resources:
12+
- deployments.yaml
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Sample EPP configuration for running with E/PD
2+
apiVersion: inference.networking.x-k8s.io/v1alpha1
3+
kind: EndpointPickerConfig
4+
featureGates:
5+
- prepareDataPlugins
6+
plugins:
7+
- type: encoder-header-handler
8+
- type: queue-scorer
9+
- type: encode-filter
10+
- type: decode-filter
11+
- type: always-encode-decider
12+
- type: ed-profile-handler
13+
parameters:
14+
deciderPluginName: always-encode-decider
15+
schedulingProfiles:
16+
- name: encode
17+
plugins:
18+
- pluginRef: encode-filter
19+
weight: 2
20+
- name: decode
21+
plugins:
22+
- pluginRef: decode-filter
23+
- pluginRef: queue-scorer
24+
weight: 1
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# ------------------------------------------------------------------------------
2+
# Kubernetes In Docker (KIND) Environment
3+
#
4+
# This will deploy the full development stack on a KIND cluster:
5+
#
6+
# * Istio Control Plane
7+
# * Real vLLM (E/PD mode)
8+
# * Inference Gateway
9+
#
10+
# ------------------------------------------------------------------------------
11+
apiVersion: kustomize.config.k8s.io/v1beta1
12+
kind: Kustomization
13+
14+
resources:
15+
- ../base-kind-istio/
16+
- ../../../components/vllm-sim-epd/

0 commit comments

Comments
 (0)