Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 238 additions & 0 deletions deploy/components/vllm-sim-epd/deployments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# PersistentVolume backing the encoder cache ("ec-cache") directory that the
# vLLM pods in this file mount at /shared-ec-cache through the ec-cache-pvc
# claim.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: ec-cache-pv
spec:
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteMany
  # Matches the storageClassName requested by ec-cache-pvc.
  storageClassName: manual
  # hostPath is a directory on the node's own filesystem, so pods only see the
  # same files when they run on the same node.
  # NOTE(review): presumably intended for single-node dev clusters - confirm.
  hostPath:
    path: /tmp/vllm-ec-cache
---
# Claim referenced by the `ec-cache` volume of the vllm-encoder and vllm-pd
# Deployments. Storage class, access mode and requested size mirror the
# ec-cache-pv PersistentVolume declared in this file so the claim can bind to
# it.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ec-cache-pvc
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
---
# Encoder Deployment: runs the vLLM OpenAI API server with --mm-encoder-only
# and an EC transfer config whose role is `ec_producer`, using
# /shared-ec-cache (the ec-cache-pvc mount) as the connector's shared storage
# path.
#
# The ${...} placeholders are not valid values on their own.
# NOTE(review): presumably substituted (e.g. by envsubst) before the manifest
# is applied - confirm against the deploy scripts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-encoder
  labels:
    app: ${POOL_NAME}
spec:
  replicas: ${VLLM_REPLICA_COUNT_E}
  selector:
    matchLabels:
      app: ${POOL_NAME}
      # Select on the role label as well: `app: ${POOL_NAME}` alone is also
      # carried by the vllm-pd pod template, and Deployments should not have
      # overlapping selectors.
      llm-d.ai/role: encode
  template:
    metadata:
      labels:
        app: ${POOL_NAME}
        llm-d.ai/role: encode
    spec:
      containers:
        - name: vllm
          image: ${VLLM_SIMULATOR_IMAGE}
          imagePullPolicy: IfNotPresent
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model=${MODEL_NAME}"
            - "--port=8000"
            - "--gpu-memory-utilization=0.7"
            # The flag and its JSON value are passed as two separate argv
            # entries.
            - "--mm-processor-kwargs"
            - '{"max_pixels": 313600}'
            - "--enforce-eager"
            - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
            - "--no-enable-prefix-caching"
            - "--max-num-batched-tokens=114688"
            - "--mm-encoder-only"
            # shared_storage_path must stay in sync with the ec-cache
            # volumeMount below.
            - "--ec-transfer-config"
            - '{"ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
          ports:
            - name: http
              containerPort: 8000
              protocol: TCP
          env:
            - name: PORT
              value: "8000"
            # Both cache locations point at volumes mounted below.
            - name: HUGGINGFACE_HUB_CACHE
              value: "/data"
            - name: TRITON_CACHE_DIR
              value: "/.triton-cache"
          volumeMounts:
            - mountPath: /shared-ec-cache
              name: ec-cache
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: metrics-volume
              mountPath: /.config
            - name: torch-compile-cache
              mountPath: /.cache
            - name: triton-cache
              mountPath: /.triton-cache
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      volumes:
        # Only ec-cache is persistent; every other volume is a per-pod
        # emptyDir.
        - name: ec-cache
          persistentVolumeClaim:
            claimName: ec-cache-pvc
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: metrics-volume
          emptyDir: {}
        - name: torch-compile-cache
          emptyDir: {}
        - name: triton-cache
          emptyDir: {}
---
# Prefill/decode Deployment: a routing sidecar listening on port 8000 in front
# of a vLLM OpenAI API server on port 8200. vLLM is given an EC transfer config
# whose role is `ec_consumer`, using /shared-ec-cache (the ec-cache-pvc mount)
# as the connector's shared storage path.
#
# The ${...} placeholders are not valid values on their own.
# NOTE(review): presumably substituted (e.g. by envsubst) before the manifest
# is applied - confirm against the deploy scripts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-pd
  labels:
    app: ${POOL_NAME}
spec:
  replicas: ${VLLM_REPLICA_COUNT_D}
  selector:
    matchLabels:
      app: ${POOL_NAME}
      # Select on the role label as well: `app: ${POOL_NAME}` alone is also
      # carried by the vllm-encoder pod template, and Deployments should not
      # have overlapping selectors.
      llm-d.ai/role: prefill-decode
  template:
    metadata:
      labels:
        app: ${POOL_NAME}
        llm-d.ai/role: prefill-decode
    spec:
      initContainers:
        - name: routing-sidecar
          image: ${SIDECAR_IMAGE}
          imagePullPolicy: IfNotPresent
          args:
            - "--port=8000"
            # Must match the --port the vllm container is started with.
            - "--vllm-port=8200"
            - "--secure-proxy=false"
            - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
          ports:
            # 8000 plus seven consecutive "rank" ports.
            # NOTE(review): presumably one port per data-parallel rank, i.e.
            # up to 8 ranks - confirm against the sidecar.
            - name: sidecar-http
              containerPort: 8000
              protocol: TCP
            - name: sidecar-rank1
              containerPort: 8001
              protocol: TCP
            - name: sidecar-rank2
              containerPort: 8002
              protocol: TCP
            - name: sidecar-rank3
              containerPort: 8003
              protocol: TCP
            - name: sidecar-rank4
              containerPort: 8004
              protocol: TCP
            - name: sidecar-rank5
              containerPort: 8005
              protocol: TCP
            - name: sidecar-rank6
              containerPort: 8006
              protocol: TCP
            - name: sidecar-rank7
              containerPort: 8007
              protocol: TCP
          # Container-level restartPolicy on an init container: Kubernetes
          # runs it as a sidecar that keeps running alongside the main
          # containers instead of having to exit first.
          restartPolicy: Always
          env:
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
      containers:
        - name: vllm
          image: ${VLLM_SIMULATOR_IMAGE}
          imagePullPolicy: IfNotPresent
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model=${MODEL_NAME}"
            - "--port=8200"
            - "--gpu-memory-utilization=0.7"
            - "--max-model-len=32768"
            # The flag and its JSON value are passed as two separate argv
            # entries.
            - "--mm-processor-kwargs"
            - '{"max_pixels": 313600}'
            - "--enforce-eager"
            - "--max-num-seqs=1"
            - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
            # shared_storage_path must stay in sync with the ec-cache
            # volumeMount below.
            - "--ec-transfer-config"
            - '{"ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
          ports:
            # Mirrors the sidecar's port layout, offset to 8200-8207.
            - name: http
              containerPort: 8200
              protocol: TCP
            - name: rank1
              containerPort: 8201
              protocol: TCP
            - name: rank2
              containerPort: 8202
              protocol: TCP
            - name: rank3
              containerPort: 8203
              protocol: TCP
            - name: rank4
              containerPort: 8204
              protocol: TCP
            - name: rank5
              containerPort: 8205
              protocol: TCP
            - name: rank6
              containerPort: 8206
              protocol: TCP
            - name: rank7
              containerPort: 8207
              protocol: TCP
          env:
            - name: PORT
              value: "8200"
            # Both cache locations point at volumes mounted below.
            - name: HUGGINGFACE_HUB_CACHE
              value: "/data"
            - name: TRITON_CACHE_DIR
              value: "/.triton-cache"
          volumeMounts:
            - mountPath: /shared-ec-cache
              name: ec-cache
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: metrics-volume
              mountPath: /.config
            - name: torch-compile-cache
              mountPath: /.cache
            - name: triton-cache
              mountPath: /.triton-cache
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      volumes:
        # Only ec-cache is persistent; every other volume is a per-pod
        # emptyDir.
        - name: ec-cache
          persistentVolumeClaim:
            claimName: ec-cache-pvc
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: metrics-volume
          emptyDir: {}
        - name: torch-compile-cache
          emptyDir: {}
        - name: triton-cache
          emptyDir: {}
12 changes: 12 additions & 0 deletions deploy/components/vllm-sim-epd/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ------------------------------------------------------------------------------
# vLLM E/PD component
#
# Bundles deployments.yaml, which defines the shared encoder-cache
# PersistentVolume / PersistentVolumeClaim and the `vllm-encoder` and `vllm-pd`
# Deployments. Meant for small environments such as Kubernetes In Docker (KIND)
# clusters.
#
# NOTE(review): the containers take their image from ${VLLM_SIMULATOR_IMAGE}
# but start the vLLM OpenAI API server entrypoint - confirm whether this
# component is expected to run the simulator or a real vLLM image.
# ------------------------------------------------------------------------------
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - deployments.yaml
23 changes: 23 additions & 0 deletions deploy/config/sim-epd-epp-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Sample EPP configuration for running with E/PD
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
featureGates:
  - prepareDataPlugins
# Every plugin referenced below (by pluginRef or deciderPluginName) is
# declared here first.
plugins:
  - type: encoder-header-handler
  - type: queue-scorer
  - type: encode-filter
  - type: decode-filter
  - type: always-encode-decider
  - type: ed-profile-handler
    parameters:
      # Refers to the always-encode-decider plugin declared above.
      deciderPluginName: always-encode-decider
schedulingProfiles:
  # The encode profile only filters; it lists no scorer.
  - name: encode
    plugins:
      - pluginRef: encode-filter
  # The decode profile filters, then scores with queue-scorer.
  - name: decode
    plugins:
      - pluginRef: decode-filter
      - pluginRef: queue-scorer
        weight: 1
16 changes: 16 additions & 0 deletions deploy/environments/dev/kind-istio-epd/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ------------------------------------------------------------------------------
# Kubernetes In Docker (KIND) Environment - E/PD variant
#
# Deploys the full development stack on a KIND cluster:
#
# * Istio Control Plane
# * vLLM in EPD mode (from components/vllm-sim-epd)
# * Inference Gateway
#
# NOTE(review): this header previously said "Real VLLM", while the component
# it pulls in is named `vllm-sim-epd` - confirm which one is intended.
# ------------------------------------------------------------------------------
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - ../base-kind-istio/
  - ../../../components/vllm-sim-epd/
Loading