Commit bb49b50

try to match infra cicd
Signed-off-by: Michael Kalantar <kalantar@us.ibm.com>
1 parent 927259c commit bb49b50

File tree

1 file changed

scenarios/examples/wide-ep-lws.sh

Lines changed: 86 additions & 23 deletions
@@ -1,5 +1,5 @@
-# P/D DISAGGREGATION WELL LIT PATH
-# Based on https://github.com/llm-d-incubation/llm-d-infra/tree/main/quickstart/examples/pd-disaggregation
+# WIDE EP WELL LIT PATH
+# Based on https://github.com/llm-d-incubation/llm-d-infra/tree/main/quickstart/examples/wide-ep-lws
 # Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
 # Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware.
 # Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed.
@@ -11,43 +11,84 @@
 # Model parameters
 # export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
 # export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
-export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
-export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
+# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
+export LLMDBENCH_DEPLOY_MODEL_LIST=deepseek-ai/DeepSeek-V2-Lite
+export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=30Gi
 
 # Workload parameters
 export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=random_concurrent.yaml
 export LLMDBENCH_HARNESS_NAME=vllm-benchmark
 
 # Routing configuration (via gaie)
+export LLMDBENCH_LLMD_INFERENCESCHEDULER_IMAGE_TAG=v0.2.1
 export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE=pd-config.yaml
+export LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS=$(mktemp)
+cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS}
+pd-config.yaml: |
+  # ALWAYS DO PD IN THIS EXAMPLE (THRESHOLD 0)
+  apiVersion: inference.networking.x-k8s.io/v1alpha1
+  kind: EndpointPickerConfig
+  plugins:
+  - type: prefill-header-handler
+  - type: prefill-filter
+  - type: decode-filter
+  - type: max-score-picker
+  - type: queue-scorer
+    parameters:
+      hashBlockSize: 5
+      maxPrefixBlocksToMatch: 256
+      lruCapacityPerServer: 31250
+  - type: pd-profile-handler
+    parameters:
+      threshold: 0
+      hashBlockSize: 5
+  schedulingProfiles:
+  - name: prefill
+    plugins:
+    - pluginRef: prefill-filter
+    - pluginRef: queue-scorer
+      weight: 1.0
+    - pluginRef: max-score-picker
+  - name: decode
+    plugins:
+    - pluginRef: decode-filter
+    - pluginRef: queue-scorer
+      weight: 1.0
+    - pluginRef: max-score-picker
+EOF
 
 # Routing configuration (via modelservice)
 export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true
 export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_POOL=true
 export LLMDBENCH_VLLM_MODELSERVICE_EPP=true
+
+export LLMDBENCH_LLMD_ROUTINGSIDECAR_IMAGE_TAG=v0.2.0
 # export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
-export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=3
+# export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=3
 
 # Prefill and Decode configuration (via modelservice)
 
 export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
 
+# export LLMDBENCH_LLMD_IMAGE_NAME=llm-d-dev@sha256
+# export LLMDBENCH_LLMD_IMAGE_TAG=dcb6b80a53d058e62dcbfc1166bf9e78419a62ea1e424489c85bc872f229a8e7
+export LLMDBENCH_LLMD_IMAGE_NAME=llm-d
+export LLMDBENCH_LLMD_IMAGE_TAG=v0.2.0
+
 export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
 cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
 - name: VLLM_FUSED_MOE_CHUNK_SIZE
   value: "1024"
 - name: DP_SIZE_LOCAL
-  value: "2"
+  value: "1"
 - name: TRITON_LIBCUDA_PATH
   value: "/usr/lib64"
 - name: VLLM_SKIP_P2P_CHECK
   value: "1"
 - name: VLLM_RANDOMIZE_DP_DUMMY_INPUTS
   value: "1"
-- name: VLLM_USE_DEEP_GEMM
-  value: "1"
 - name: VLLM_ALL2ALL_BACKEND
-  value: "deepep_low_latency"
+  value: "naive"
 - name: NVIDIA_GDRCOPY
   value: "enabled"
 - name: NVSHMEM_DEBUG
@@ -66,23 +107,39 @@ cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
   value: "ibp"
 - name: VLLM_LOGGING_LEVEL
   value: "INFO"
-- name: HF_HUB_CACHE
-  value: "/model-cache/models"
 EOF
 
 # export LLMDBENCH_VLLM_MODELSERVICE_MOUNT_MODEL_VOLUME_OVERRIDE=false
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=1
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=2
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=2
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=1
+# MK not in infra cicd
+# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=32
+# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=512Gi
+# end MK
+# Uncomment the following line to enable multi-nic
+#export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$(id -un),modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
+# Uncomment the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
+#export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
+#export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=4
+# MK not in infra cicd
+# export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE:-}
+# export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=${LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR:-}
+# export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_RESOURCE=ephemeral-storage
+# export LLMDBENCH_VLLM_COMMON_EPHEMERAL_STORAGE_NR=64Gi
+# end MK
+
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
+
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$(mktemp)
 cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
 START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
 source /opt/vllm/bin/activate
-exec vllm serve /model-cache/models/Qwen/Qwen3-0.6B \
-  --port 8200 \
+exec vllm serve /model-cache/models/REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
+  --port REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT \
+  --enforce-eager \
+  --max-model-len 4096 \
   --disable-log-requests \
   --disable-uvicorn-access-log \
   --enable-expert-parallel \
@@ -94,7 +151,7 @@ START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
   --data-parallel-rpc-port 5555 \
   --data-parallel-start-rank \$START_RANK \
   --trust-remote-code \
-  --kv_transfer_config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+  --kv_transfer_config "{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}"
 EOF
 export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG=$(mktemp)
 cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG}
@@ -117,24 +174,30 @@ cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES}
 EOF
 
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=1
+
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
 # Uncomment the following line to enable multi-nic
 #export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$(id -un),modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
 # Uncomment the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
 #export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
 #export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=4
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=1
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=2
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
+# MK not in infra cicd
+# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
+# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
+# end MK
+
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=custom
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS=$(mktemp)
 cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
 START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
 
 source /opt/vllm/bin/activate
-exec vllm serve /model-cache/models/Qwen/Qwen3-0.6B \
-  --port 8000 \
+exec vllm serve /model-cache/models/REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
+  --port REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_INFERENCE_PORT \
+  --enforce-eager \
+  --max-model-len 4096 \
   --disable-log-requests \
   --disable-uvicorn-access-log \
   --enable-expert-parallel \
@@ -146,5 +209,5 @@ START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
   --data-parallel-rpc-port 5555 \
   --data-parallel-start-rank \$START_RANK \
   --trust-remote-code \
-  --kv_transfer_config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+  --kv_transfer_config "{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}"
 EOF
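
A note on the rank arithmetic above: both the decode and prefill commands derive each pod's starting data-parallel rank from the worker index that LeaderWorkerSet injects. Below is a minimal sketch of that arithmetic under this scenario's new settings, runnable in any POSIX-ish shell; the assumption that total data parallelism equals workers times DP_SIZE_LOCAL comes from how LWS and vLLM's data-parallel flags are typically wired together, not from anything stated in this diff.

# Sketch only: LWS_WORKER_INDEX is set by LeaderWorkerSet at runtime;
# we loop over it here to show the resulting rank layout.
DP_SIZE_LOCAL=1        # value written to LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML above
DATA_PARALLEL_SIZE=2   # matches LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM
for LWS_WORKER_INDEX in $(seq 0 $(( DATA_PARALLEL_SIZE / DP_SIZE_LOCAL - 1 ))); do
  START_RANK=$(( LWS_WORKER_INDEX * DP_SIZE_LOCAL ))
  echo "worker ${LWS_WORKER_INDEX}: DP ranks ${START_RANK}..$(( START_RANK + DP_SIZE_LOCAL - 1 ))"
done
# worker 0: DP ranks 0..0
# worker 1: DP ranks 1..1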

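The quoting change on the kv_transfer_config lines is easy to misread because of here-document escaping rules: with an unquoted delimiter (cat << EOF), the shell expands \$ (which is why \$START_RANK is escaped) but leaves \" alone, so the escaped double-quote form is written to the extra-args file verbatim. A quick check, assuming only POSIX shell:

tmp=$(mktemp)
cat << EOF > "$tmp"
--kv_transfer_config "{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}"
EOF
cat "$tmp"
# prints: --kv_transfer_config "{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}"

The commit does not say why the double-quoted form is preferred over the single-quoted one (both yield the same argument when the file is executed by a shell); presumably it survives the downstream templating that splices this file into the generated manifests, but that is an inference.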
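
Because the scenario is plain bash (exports plus heredocs into mktemp files), one way to sanity-check it before handing it to the harness is to source it and inspect what it generated. The harness invocation itself is omitted here since the entry point is outside this diff:

source scenarios/examples/wide-ep-lws.sh                   # populate the LLMDBENCH_* variables
env | grep '^LLMDBENCH_' | sort                            # exported configuration
cat "${LLMDBENCH_VLLM_MODELSERVICE_GAIE_CUSTOM_PLUGINS}"   # generated pd-config.yaml
cat "${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS}"     # generated decode vllm command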