export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=kgateway

# WIDE EP/DP WITH LWS WELL LIT PATH
# Based on https://github.com/llm-d/llm-d/tree/main/guides/wide-ep-lws/README.md
# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti

# Routing configuration (via gaie)
# NOTE(review): the rendered diff showed a leading space inside the quotes
# ("​ custom-plugins.yaml"); that looks like an extraction artifact — a filename
# with a leading space would not match the heredoc key below. Confirm upstream.
export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="custom-plugins.yaml"
# Write the custom EPP (endpoint-picker) plugin config to a temp file; the
# heredoc delimiter is unquoted, but the YAML contains no $-expansions.
export LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS=$(mktemp)
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS
custom-plugins.yaml: |
  apiVersion: inference.networking.x-k8s.io/v1alpha1
  kind: EndpointPickerConfig
  plugins:
  - type: prefill-header-handler
  - type: prefill-filter
  - type: decode-filter
  - type: random-picker
    parameters:
      maxNumOfEndpoints: 1
  - type: pd-profile-handler
    parameters:
      threshold: 0
      hashBlockSize: 5
  schedulingProfiles:
  - name: prefill
    plugins:
    - pluginRef: prefill-filter
    - pluginRef: random-picker
  - name: decode
    plugins:
    - pluginRef: decode-filter
    - pluginRef: random-picker
EOF

# Routing configuration (via modelservice)
# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EPHEMERAL_STORAGE_NR=1Ti
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT=8000
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=custom
# NOTE(review): this commented-out variable is named DECODE_PREPROCESS but sits
# in the PREFILL section — presumably should be PREFILL_PREPROCESS; confirm
# before re-enabling.
# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=$(mktemp)
# Clear /dev/shm on start (see https://github.com/llm-d/llm-d/issues/352) and
# compute this worker's starting data-parallel rank from its LWS index.
# \$ keeps the expansion literal so it is evaluated in the pod, not here.
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
find /dev/shm -type f -delete; START_RANK=\$((\${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL)); exec vllm serve \
  REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
  --port 8000 \
  --trust-remote-code \
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EPHEMERAL_STORAGE_NR=1Ti
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT=8200
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS="python3 /setup/preprocess/set_llmdbench_environment.py; source \$HOME/llmdbench_env.sh"
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$(mktemp)
# Clear /dev/shm on start to prevent running out of space when crashes occur
# https://github.com/llm-d/llm-d/issues/352
# START_RANK is this worker's starting data-parallel rank, derived from its LWS
# index; \$ keeps the expansion literal so it is evaluated in the pod, not here.
cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
find /dev/shm -type f -delete; START_RANK=\$((\${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL)); exec vllm serve \
  REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
  --port 8200 \
  --trust-remote-code \