1- # P/D DISAGGREGATION WELL LIT PATH
2- # Based on https://github.com/llm-d-incubation/llm-d-infra/tree/main/quickstart/examples/pd-disaggregation
1+ # WIDE EP WELL LIT PATH
2+ # Based on https://github.com/llm-d-incubation/llm-d-infra/tree/main/quickstart/examples/wide-ep-lws
33# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
44# Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware.
55# Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed.
1111# Model parameters
1212# export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
1313# export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
14- export LLMDBENCH_DEPLOY_MODEL_LIST=" meta-llama/Llama-3.1-8B-Instruct"
15- export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
14+ # export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
15+ export LLMDBENCH_DEPLOY_MODEL_LIST=deepseek-ai/DeepSeek-V2-Lite
16+ export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=30Gi
1617
1718# Workload parameters
1819export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=random_concurrent.yaml
1920export LLMDBENCH_HARNESS_NAME=vllm-benchmark
2021
2122# Routing configuration (via gaie)
23+ export LLMDBENCH_LLMD_INFERENCESCHEDULER_IMAGE_TAG=v0.2.1
2224export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE=pd-config.yaml
25+ export LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS=$( mktemp)
26+ cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS}
27+ pd-config.yaml: |
28+ # ALWAYS DO PD IN THIS EXAMPLE (THRESHOLD 0)
29+ apiVersion: inference.networking.x-k8s.io/v1alpha1
30+ kind: EndpointPickerConfig
31+ plugins:
32+ - type: prefill-header-handler
33+ - type: prefill-filter
34+ - type: decode-filter
35+ - type: max-score-picker
36+ - type: queue-scorer
37+ parameters:
38+ hashBlockSize: 5
39+ maxPrefixBlocksToMatch: 256
40+ lruCapacityPerServer: 31250
41+ - type: pd-profile-handler
42+ parameters:
43+ threshold: 0
44+ hashBlockSize: 5
45+ schedulingProfiles:
46+ - name: prefill
47+ plugins:
48+ - pluginRef: prefill-filter
49+ - pluginRef: queue-scorer
50+ weight: 1.0
51+ - pluginRef: max-score-picker
52+ - name: decode
53+ plugins:
54+ - pluginRef: decode-filter
55+ - pluginRef: queue-scorer
56+ weight: 1.0
57+ - pluginRef: max-score-picker
58+ EOF
2359
2460# Routing configuration (via modelservice)
2561export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true
2662export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_POOL=true
2763export LLMDBENCH_VLLM_MODELSERVICE_EPP=true
64+
65+ export LLMDBENCH_LLMD_ROUTINGSIDECAR_IMAGE_TAG=v0.2.0
2866# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
29- export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=3
67+ # export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=3
3068
3169# Prefill and Decode configuration (via modelservice)
3270
3371export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
3472
73+ # export LLMDBENCH_LLMD_IMAGE_NAME=llm-d-dev@sha256
74+ # export LLMDBENCH_LLMD_IMAGE_TAG=dcb6b80a53d058e62dcbfc1166bf9e78419a62ea1e424489c85bc872f229a8e7
75+ export LLMDBENCH_LLMD_IMAGE_NAME=llm-d
76+ export LLMDBENCH_LLMD_IMAGE_TAG=v0.2.0
77+
3578export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$( mktemp)
3679cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
3780- name: VLLM_FUSED_MOE_CHUNK_SIZE
3881 value: "1024"
3982- name: DP_SIZE_LOCAL
40- value: "2 "
83+ value: "1"
4184- name: TRITON_LIBCUDA_PATH
4285 value: "/usr/lib64"
4386- name: VLLM_SKIP_P2P_CHECK
4487 value: "1"
4588- name: VLLM_RANDOMIZE_DP_DUMMY_INPUTS
4689 value: "1"
47- - name: VLLM_USE_DEEP_GEMM
48- value: "1"
4990- name: VLLM_ALL2ALL_BACKEND
50- value: "deepep_low_latency "
91+ value: "naive"
5192- name: NVIDIA_GDRCOPY
5293 value: "enabled"
5394- name: NVSHMEM_DEBUG
@@ -66,23 +107,39 @@ cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
66107 value: "ibp"
67108- name: VLLM_LOGGING_LEVEL
68109 value: "INFO"
69- - name: HF_HUB_CACHE
70- value: "/model-cache/models"
71110EOF
72111
73112# export LLMDBENCH_VLLM_MODELSERVICE_MOUNT_MODEL_VOLUME_OVERRIDE=false
74113export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=1
75114export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=2
76- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=2
77- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
78- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
115+ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
116+ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=1
117+ # MK not in infra cicd
118+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=32
119+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=512Gi
120+ # end MK
121+ # Uncomment the following line to enable multi-nic
122+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$(id -un),modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
123+ # Uncomment the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
124+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
125+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=4
126+ # MK not in infra cicd
127+ # export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE:-}
128+ # export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR:-}
129+ # export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=ephemeral-storage
130+ # export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=64Gi
131+ # end MK
132+
79133export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
134+
80135export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$( mktemp)
81136cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
82137START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
83138 source /opt/vllm/bin/activate
84- exec vllm serve /model-cache/models/Qwen/Qwen3-0.6B \
85- --port 8200 \
139+ exec vllm serve /model-cache/models/REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
140+ --port REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT \
141+ --enforce-eager \
142+ --max-model-len 4096 \
86143--disable-log-requests \
87144--disable-uvicorn-access-log \
88145--enable-expert-parallel \
@@ -94,7 +151,7 @@ START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
94151--data-parallel-rpc-port 5555 \
95152--data-parallel-start-rank \$START_RANK \
96153--trust-remote-code \
97- --kv_transfer_config '{ "kv_connector": "NixlConnector", "kv_role": "kv_both"}'
154+ --kv_transfer_config "{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}"
98155EOF
99156export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG=$( mktemp)
100157cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG}
@@ -117,24 +174,30 @@ cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES}
117174EOF
118175
119176export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=1
177+
120178export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
121179# Uncomment the following line to enable multi-nic
122180# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$(id -un),modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
123181# Uncomment the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
124182# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
125183# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=4
126- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=1
184+ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=2
127185export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
128- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
129- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
186+ # MK not in infra cicd
187+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
188+ # export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
189+ # end MK
190+
130191export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=custom
131192export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=$( mktemp)
132193cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
133194START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
134195
135196 source /opt/vllm/bin/activate
136- exec vllm serve /model-cache/models/Qwen/Qwen3-0.6B \
137- --port 8000 \
197+ exec vllm serve /model-cache/models/REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
198+ --port REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT \
199+ --enforce-eager \
200+ --max-model-len 4096 \
138201--disable-log-requests \
139202--disable-uvicorn-access-log \
140203--enable-expert-parallel \
@@ -146,5 +209,5 @@ START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
146209--data-parallel-rpc-port 5555 \
147210--data-parallel-start-rank \$START_RANK \
148211--trust-remote-code \
149- --kv_transfer_config '{ "kv_connector": "NixlConnector", "kv_role": "kv_both"}'
212+ --kv_transfer_config "{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}"
150213EOF
0 commit comments