|
# PersistentVolume backing the shared encoder-cache directory.
# NOTE: hostPath storage lives on a single node. ReadWriteMany only lets
# several pods mount it; the data is shared only between pods scheduled on
# the same node (fine for single-node dev/test clusters).
apiVersion: v1
kind: PersistentVolume
metadata:
  name: ec-cache-pv
spec:
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteMany
  storageClassName: manual
  hostPath:
    path: /tmp/vllm-ec-cache
    # Create the directory on first use and fail fast if the path exists
    # but is not a directory, instead of mounting it unchecked.
    type: DirectoryOrCreate
---
# Claim for the shared encoder-cache volume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ec-cache-pvc
spec:
  storageClassName: manual
  # Pin the claim to the PersistentVolume named ec-cache-pv so it cannot
  # bind to some other volume that happens to use the "manual" class.
  volumeName: ec-cache-pv
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
---
# Encoder Deployment: runs vLLM with --mm-encoder-only and the "ec_producer"
# role, writing to the shared path /shared-ec-cache (see --ec-transfer-config).
# ${...} placeholders are expected to be substituted before applying.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-encoder
  labels:
    app: ${POOL_NAME}
spec:
  replicas: ${VLLM_REPLICA_COUNT_E}
  selector:
    matchLabels:
      app: ${POOL_NAME}
      # Include the role label so this selector does not overlap with other
      # Deployments that share the same app label. spec.selector is immutable
      # in apps/v1: an already-applied Deployment must be deleted and
      # recreated to pick up this change.
      llm-d.ai/role: encode
  template:
    metadata:
      labels:
        app: ${POOL_NAME}
        llm-d.ai/role: encode
    spec:
      containers:
        - name: vllm
          image: ${VLLM_SIMULATOR_IMAGE}
          imagePullPolicy: IfNotPresent
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model=${MODEL_NAME}"
            - "--port=8000"
            - "--gpu-memory-utilization=0.7"
            # Flag and its JSON value are passed as two separate argv entries.
            - "--mm-processor-kwargs"
            - '{"max_pixels": 313600}'
            - "--enforce-eager"
            - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
            - "--no-enable-prefix-caching"
            - "--max-num-batched-tokens=114688"
            - "--mm-encoder-only"
            - "--ec-transfer-config"
            - '{"ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
          ports:
            - name: http
              containerPort: 8000
              protocol: TCP
          env:
            - name: PORT
              value: "8000"
            # Points at the "data" emptyDir mounted at /data below.
            - name: HUGGINGFACE_HUB_CACHE
              value: "/data"
            # Points at the "triton-cache" emptyDir mounted below.
            - name: TRITON_CACHE_DIR
              value: "/.triton-cache"
          volumeMounts:
            # Must match shared_storage_path in --ec-transfer-config.
            - mountPath: /shared-ec-cache
              name: ec-cache
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: metrics-volume
              mountPath: /.config
            - name: torch-compile-cache
              mountPath: /.cache
            - name: triton-cache
              mountPath: /.triton-cache
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      volumes:
        - name: ec-cache
          persistentVolumeClaim:
            claimName: ec-cache-pvc
        - name: data
          emptyDir: {}
        # Memory-backed emptyDir mounted at /dev/shm.
        - name: shm
          emptyDir:
            medium: Memory
        - name: metrics-volume
          emptyDir: {}
        - name: torch-compile-cache
          emptyDir: {}
        - name: triton-cache
          emptyDir: {}
---
# Prefill/decode Deployment: runs vLLM on port 8200 with the "ec_consumer"
# role, reading from the shared path /shared-ec-cache, fronted by a
# routing-sidecar container on port 8000.
# ${...} placeholders are expected to be substituted before applying.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-pd
  labels:
    app: ${POOL_NAME}
spec:
  replicas: ${VLLM_REPLICA_COUNT_D}
  selector:
    matchLabels:
      app: ${POOL_NAME}
      # Include the role label so this selector does not overlap with other
      # Deployments that share the same app label. spec.selector is immutable
      # in apps/v1: an already-applied Deployment must be deleted and
      # recreated to pick up this change.
      llm-d.ai/role: prefill-decode
  template:
    metadata:
      labels:
        app: ${POOL_NAME}
        llm-d.ai/role: prefill-decode
    spec:
      initContainers:
        # Init container with restartPolicy: Always, i.e. a Kubernetes
        # sidecar container that keeps running alongside the main container.
        - name: routing-sidecar
          image: ${SIDECAR_IMAGE}
          imagePullPolicy: IfNotPresent
          args:
            - "--port=8000"
            # Matches the --port the vllm container below is started with.
            - "--vllm-port=8200"
            - "--secure-proxy=false"
            - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
          ports:
            # Ports 8001-8007 are presumably one listener per additional
            # data-parallel rank (up to 8 ranks) -- TODO confirm.
            - name: sidecar-http
              containerPort: 8000
              protocol: TCP
            - name: sidecar-rank1
              containerPort: 8001
              protocol: TCP
            - name: sidecar-rank2
              containerPort: 8002
              protocol: TCP
            - name: sidecar-rank3
              containerPort: 8003
              protocol: TCP
            - name: sidecar-rank4
              containerPort: 8004
              protocol: TCP
            - name: sidecar-rank5
              containerPort: 8005
              protocol: TCP
            - name: sidecar-rank6
              containerPort: 8006
              protocol: TCP
            - name: sidecar-rank7
              containerPort: 8007
              protocol: TCP
          restartPolicy: Always
          env:
            # Pod IP injected through the downward API.
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
      containers:
        - name: vllm
          image: ${VLLM_SIMULATOR_IMAGE}
          imagePullPolicy: IfNotPresent
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model=${MODEL_NAME}"
            - "--port=8200"
            - "--gpu-memory-utilization=0.7"
            - "--max-model-len=32768"
            # Flag and its JSON value are passed as two separate argv entries.
            - "--mm-processor-kwargs"
            - '{"max_pixels": 313600}'
            - "--enforce-eager"
            - "--max-num-seqs=1"
            - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
            - "--ec-transfer-config"
            - '{"ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
          ports:
            # Ports 8201-8207 presumably serve the additional data-parallel
            # ranks, mirroring the sidecar's 8001-8007 -- TODO confirm.
            - name: http
              containerPort: 8200
              protocol: TCP
            - name: rank1
              containerPort: 8201
              protocol: TCP
            - name: rank2
              containerPort: 8202
              protocol: TCP
            - name: rank3
              containerPort: 8203
              protocol: TCP
            - name: rank4
              containerPort: 8204
              protocol: TCP
            - name: rank5
              containerPort: 8205
              protocol: TCP
            - name: rank6
              containerPort: 8206
              protocol: TCP
            - name: rank7
              containerPort: 8207
              protocol: TCP
          env:
            - name: PORT
              value: "8200"
            # Points at the "data" emptyDir mounted at /data below.
            - name: HUGGINGFACE_HUB_CACHE
              value: "/data"
            # Points at the "triton-cache" emptyDir mounted below.
            - name: TRITON_CACHE_DIR
              value: "/.triton-cache"
          volumeMounts:
            # Must match shared_storage_path in --ec-transfer-config.
            - mountPath: /shared-ec-cache
              name: ec-cache
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: metrics-volume
              mountPath: /.config
            - name: torch-compile-cache
              mountPath: /.cache
            - name: triton-cache
              mountPath: /.triton-cache
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      volumes:
        - name: ec-cache
          persistentVolumeClaim:
            claimName: ec-cache-pvc
        - name: data
          emptyDir: {}
        # Memory-backed emptyDir mounted at /dev/shm.
        - name: shm
          emptyDir:
            medium: Memory
        - name: metrics-volume
          emptyDir: {}
        - name: torch-compile-cache
          emptyDir: {}
        - name: triton-cache
          emptyDir: {}
0 commit comments