export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=kgateway

# WIDE EP/DP WITH LWS WELL LIT PATH
# Based on https://github.com/llm-d/llm-d/tree/main/guides/wide-ep-lws/README.md
# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti

# Routing configuration (via gaie)
# NOTE(review): the rendered diff showed a leading space inside the quotes
# ("​ custom-plugins.yaml"); that looks like an extraction artifact — a filename
# with a leading space would not match the heredoc key below. Confirm upstream.
export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="custom-plugins.yaml"
# Write the custom EPP (endpoint-picker) plugin config to a temp file; the
# heredoc delimiter is unquoted, but the YAML contains no $-expansions.
export LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS=$(mktemp)
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS
custom-plugins.yaml: |
  apiVersion: inference.networking.x-k8s.io/v1alpha1
  kind: EndpointPickerConfig
  plugins:
  - type: prefill-header-handler
  - type: prefill-filter
  - type: decode-filter
  - type: random-picker
    parameters:
      maxNumOfEndpoints: 1
  - type: pd-profile-handler
    parameters:
      threshold: 0
      hashBlockSize: 5
  schedulingProfiles:
  - name: prefill
    plugins:
    - pluginRef: prefill-filter
    - pluginRef: random-picker
  - name: decode
    plugins:
    - pluginRef: decode-filter
    - pluginRef: random-picker
EOF

# Routing configuration (via modelservice)
# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE_NR=1Ti
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT=8000
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=custom
# NOTE(review): this commented-out variable is named DECODE_PREPROCESS but sits
# in the PREFILL section — presumably should be PREFILL_PREPROCESS; confirm
# before re-enabling.
# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=$(mktemp)
# Clear /dev/shm on start (see https://github.com/llm-d/llm-d/issues/352) and
# compute this worker's starting data-parallel rank from its LWS index.
# \$ keeps the expansion literal so it is evaluated in the pod, not here.
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
find /dev/shm -type f -delete; START_RANK=\$((\${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL)); exec vllm serve \
  REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
  --port 8000 \
  --trust-remote-code \
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE_NR=1Ti
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT=8200
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$(mktemp)
# Clear /dev/shm on start to prevent running out of space when crashes occur
# https://github.com/llm-d/llm-d/issues/352
# START_RANK is this worker's starting data-parallel rank, derived from its LWS
# index; \$ keeps the expansion literal so it is evaluated in the pod, not here.
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
find /dev/shm -type f -delete; START_RANK=\$((\${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL)); exec vllm serve \
  REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
  --port 8200 \
  --trust-remote-code \