Initial E/PD extension for kind deployment

revit13 · revit13 · commit 09ad0120f47d · 2026-02-24T07:38:47.000+02:00
Signed-off-by: Revital Sur <eres@il.ibm.com>
diff --git a/deploy/components/vllm-sim-epd/deployments.yaml b/deploy/components/vllm-sim-epd/deployments.yaml
@@ -0,0 +1,238 @@
+apiVersion: v1
+kind: PersistentVolume  # static volume; class, access mode and size match ec-cache-pvc
+metadata:
+  name: ec-cache-pv
+spec:
+  storageClassName: manual  # same class name as requested by ec-cache-pvc
+  accessModes:
+    - ReadWriteMany
+  capacity:
+    storage: 5Gi
+  hostPath:
+    path: /tmp/vllm-ec-cache  # hostPath: a directory on the node's own filesystem
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # mounted as volume "ec-cache" by the vllm-encoder and vllm-pd pods
+metadata:
+  name: ec-cache-pvc
+spec:
+  resources:
+    requests:
+      storage: 5Gi  # equal to the capacity declared by ec-cache-pv
+  accessModes:
+    - ReadWriteMany  # the claim is referenced by more than one Deployment
+  storageClassName: manual  # same class name as ec-cache-pv
+---
+apiVersion: apps/v1
+kind: Deployment  # encode (E) stage: vLLM started with --mm-encoder-only and ec_role=ec_producer
+metadata:
+  name: vllm-encoder
+  labels:
+    app: ${POOL_NAME}
+spec:
+  replicas: ${VLLM_REPLICA_COUNT_E}
+  selector:
+    matchLabels:
+      app: ${POOL_NAME}
+      llm-d.ai/role: encode  # role label keeps this selector from also matching the vllm-pd pods
+  template:
+    metadata:
+      labels:
+        app: ${POOL_NAME}
+        llm-d.ai/role: encode
+    spec:
+      containers:
+      - name: vllm
+        image: ${VLLM_SIMULATOR_IMAGE}  # NOTE(review): command runs the vLLM API server module; confirm this image ships it
+        imagePullPolicy: IfNotPresent
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - "--model=${MODEL_NAME}"
+        - "--port=8000"
+        - "--gpu-memory-utilization=0.7"
+        - "--mm-processor-kwargs"
+        - '{"max_pixels": 313600}'
+        - "--enforce-eager"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
+        - "--no-enable-prefix-caching"
+        - "--max-num-batched-tokens=114688"
+        - "--mm-encoder-only"
+        - "--ec-transfer-config"  # value is the JSON on the next line; its storage path is the ec-cache mount
+        - '{"ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
+        ports:
+        - name: http
+          containerPort: 8000  # same port as --port above
+          protocol: TCP
+        env:
+        - name: PORT
+          value: "8000"
+        - name: HUGGINGFACE_HUB_CACHE
+          value: "/data"  # the "data" emptyDir mounted below
+        - name: TRITON_CACHE_DIR
+          value: "/.triton-cache"  # the "triton-cache" emptyDir mounted below
+        volumeMounts:
+        - mountPath: /shared-ec-cache
+          name: ec-cache  # backed by ec-cache-pvc, which vllm-pd mounts at the same path
+        - mountPath: /data
+          name: data
+        - mountPath: /dev/shm
+          name: shm
+        - name: metrics-volume
+          mountPath: /.config
+        - name: torch-compile-cache
+          mountPath: /.cache
+        - name: triton-cache
+          mountPath: /.triton-cache
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: ec-cache
+        persistentVolumeClaim:
+          claimName: ec-cache-pvc
+      - name: data
+        emptyDir: {}
+      - name: shm
+        emptyDir: {medium: Memory}  # memory-backed emptyDir mounted at /dev/shm
+      - name: metrics-volume
+        emptyDir: {}
+      - name: torch-compile-cache
+        emptyDir: {}
+      - name: triton-cache
+        emptyDir: {}
+---
+apiVersion: apps/v1
+kind: Deployment  # prefill-decode (PD) stage: vLLM with ec_role=ec_consumer behind a routing sidecar
+metadata:
+  name: vllm-pd
+  labels:
+    app: ${POOL_NAME}
+spec:
+  replicas: ${VLLM_REPLICA_COUNT_D}
+  selector:
+    matchLabels:
+      app: ${POOL_NAME}
+      llm-d.ai/role: prefill-decode  # role label keeps this selector from also matching the vllm-encoder pods
+  template:
+    metadata:
+      labels:
+        app: ${POOL_NAME}
+        llm-d.ai/role: prefill-decode
+    spec:
+      initContainers:
+      - name: routing-sidecar
+        image: ${SIDECAR_IMAGE}
+        imagePullPolicy: IfNotPresent
+        args:
+        - "--port=8000"
+        - "--vllm-port=8200"  # same port the vllm container is started on below
+        - "--secure-proxy=false"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
+        ports:  # 8000-8007 declared; presumably one per data-parallel rank (max 8) - TODO confirm
+        - name: sidecar-http
+          containerPort: 8000
+          protocol: TCP
+        - name: sidecar-rank1
+          containerPort: 8001
+          protocol: TCP
+        - name: sidecar-rank2
+          containerPort: 8002
+          protocol: TCP
+        - name: sidecar-rank3
+          containerPort: 8003
+          protocol: TCP
+        - name: sidecar-rank4
+          containerPort: 8004
+          protocol: TCP
+        - name: sidecar-rank5
+          containerPort: 8005
+          protocol: TCP
+        - name: sidecar-rank6
+          containerPort: 8006
+          protocol: TCP
+        - name: sidecar-rank7
+          containerPort: 8007
+          protocol: TCP
+        restartPolicy: Always  # init container with restartPolicy Always = Kubernetes sidecar container
+        env:
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP  # downward API: the pod's own IP
+      containers:
+      - name: vllm
+        image: ${VLLM_SIMULATOR_IMAGE}  # NOTE(review): command runs the vLLM API server module; confirm this image ships it
+        imagePullPolicy: IfNotPresent
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - "--model=${MODEL_NAME}"
+        - "--port=8200"
+        - "--gpu-memory-utilization=0.7"
+        - "--max-model-len=32768"
+        - "--mm-processor-kwargs"
+        - '{"max_pixels": 313600}'
+        - "--enforce-eager"
+        - "--max-num-seqs=1"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
+        - "--ec-transfer-config"  # value is the JSON on the next line; its storage path is the ec-cache mount
+        - '{"ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": {"shared_storage_path": "/shared-ec-cache"}}'
+        ports:  # 8200-8207 declared, mirroring the sidecar's 8000-8007
+        - name: http
+          containerPort: 8200
+          protocol: TCP
+        - name: rank1
+          containerPort: 8201
+          protocol: TCP
+        - name: rank2
+          containerPort: 8202
+          protocol: TCP
+        - name: rank3
+          containerPort: 8203
+          protocol: TCP
+        - name: rank4
+          containerPort: 8204
+          protocol: TCP
+        - name: rank5
+          containerPort: 8205
+          protocol: TCP
+        - name: rank6
+          containerPort: 8206
+          protocol: TCP
+        - name: rank7
+          containerPort: 8207
+          protocol: TCP
+        env:
+        - name: PORT
+          value: "8200"
+        - name: HUGGINGFACE_HUB_CACHE
+          value: "/data"  # the "data" emptyDir mounted below
+        - name: TRITON_CACHE_DIR
+          value: "/.triton-cache"  # the "triton-cache" emptyDir mounted below
+        volumeMounts:
+        - mountPath: /shared-ec-cache
+          name: ec-cache  # backed by ec-cache-pvc, which vllm-encoder mounts at the same path
+        - mountPath: /data
+          name: data
+        - mountPath: /dev/shm
+          name: shm
+        - name: metrics-volume
+          mountPath: /.config
+        - name: torch-compile-cache
+          mountPath: /.cache
+        - name: triton-cache
+          mountPath: /.triton-cache
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: ec-cache
+        persistentVolumeClaim:
+          claimName: ec-cache-pvc
+      - name: data
+        emptyDir: {}
+      - name: shm
+        emptyDir: {medium: Memory}  # memory-backed emptyDir mounted at /dev/shm
+      - name: metrics-volume
+        emptyDir: {}
+      - name: torch-compile-cache
+        emptyDir: {}
+      - name: triton-cache
+        emptyDir: {}
diff --git a/deploy/components/vllm-sim-epd/kustomization.yaml b/deploy/components/vllm-sim-epd/kustomization.yaml
@@ -0,0 +1,12 @@
+# ------------------------------------------------------------------------------
+# vLLM E/PD (encode / prefill-decode) component
+#
+# Deploys deployments.yaml: the vllm-encoder and vllm-pd Deployments and the
+# ec-cache PersistentVolume/claim they share. NOTE(review): the directory says
+# "sim", yet the containers launch the vLLM API server module; confirm intent.
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- deployments.yaml
diff --git a/deploy/config/sim-epd-epp-config.yaml b/deploy/config/sim-epd-epp-config.yaml
@@ -0,0 +1,24 @@
+# Sample EPP configuration for running with E/PD (encode / prefill-decode)
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+featureGates:
+- prepareDataPlugins
+plugins:
+- type: encoder-header-handler
+- type: queue-scorer
+- type: encode-filter
+- type: decode-filter
+- type: always-encode-decider
+- type: ed-profile-handler
+  parameters:
+    deciderPluginName: always-encode-decider  # same name as the decider plugin listed above
+schedulingProfiles:
+- name: encode
+  plugins:
+  # Filters take no weight (weights apply to scorers only), as with decode-filter below.
+  - pluginRef: encode-filter
+- name: decode
+  plugins:
+  - pluginRef: decode-filter
+  - pluginRef: queue-scorer
+    weight: 1
diff --git a/deploy/environments/dev/kind-istio-epd/kustomization.yaml b/deploy/environments/dev/kind-istio-epd/kustomization.yaml
@@ -0,0 +1,16 @@
+# ------------------------------------------------------------------------------
+# Kubernetes In Docker (KIND) Environment - E/PD variant
+#
+# This will deploy the full development stack on a KIND cluster:
+#
+#  * Istio Control Plane
+#  * vLLM in E/PD mode: encoder + prefill-decode (components/vllm-sim-epd)
+#  * Inference Gateway
+#
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../base-kind-istio/  # NOTE(review): presumably supplies the Istio and gateway pieces listed above - confirm
+- ../../../components/vllm-sim-epd/  # vllm-encoder / vllm-pd Deployments and their shared ec-cache volume
diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh