 # Its purpose is to drive development of setup/steps/09_deploy_via_modelservice.sh

 # Fill in required/desired values
-export LLMDBENCH_HF_TOKEN=
+# export LLMDBENCH_HF_TOKEN=
 # export LLMDBENCH_VLLM_COMMON_NAMESPACE=
 # export LLMDBENCH_CONTROL_WORK_DIR=

-# Cluster specific configuration (fusion6/pokprod001)
-export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
+# Cluster-specific configuration
+# export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
 export LLMDBENCH_VLLM_COMMON_AFFINITY='nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3'
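+# The affinity value is a single "label:value" node selector. To target a
+# different GPU type, swap in that node's gpu.product label, e.g.
+# (hypothetical value, not part of this scenario):
+# export LLMDBENCH_VLLM_COMMON_AFFINITY='nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB'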

 # Model(s)
 export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
-# export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=800Gi
+export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=20Gi

-# modelservice configuration
+# Routing configuration (via modelservice)

 export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true

 export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2
 export LLMDBENCH_LLMD_ROUTINGSIDECAR_DEBUG_LEVEL=3

+# Prefill and decode configuration (via modelservice)
+
 export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true

-export LLMDBENCH_VLLM_STANDALONE_VLLM_FUSED_MOE_CHUNK_SIZE="1024"
-export LLMDBENCH_VLLM_STANDALONE_DP_SIZE_LOCAL="2"
-export LLMDBENCH_VLLM_STANDALONE_TRITON_LIBCUDA_PATH="/usr/lib64"
-# export LLMDBENCH_VLLM_STANDALONE_HF_HUB_DISABLE_XET="1"
-export LLMDBENCH_VLLM_STANDALONE_VLLM_SKIP_P2P_CHECK="1"
-export LLMDBENCH_VLLM_STANDALONE_VLLM_RANDOMIZE_DP_DUMMY_INPUTS="1"
-export LLMDBENCH_VLLM_STANDALONE_VLLM_USE_DEEP_GEMM="1"
-export LLMDBENCH_VLLM_STANDALONE_VLLM_ALL2ALL_BACKEND="deepep_low_latency"
-export LLMDBENCH_VLLM_STANDALONE_NVIDIA_GDRCOPY="enabled"
-export LLMDBENCH_VLLM_STANDALONE_NVSHMEM_DEBUG="INFO"
-export LLMDBENCH_VLLM_STANDALONE_NVSHMEM_REMOTE_TRANSPORT="ibgda"
-export LLMDBENCH_VLLM_STANDALONE_NVSHMEM_IB_ENABLE_IBGDA="true"
-export LLMDBENCH_VLLM_STANDALONE_NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME="eth0"
-export LLMDBENCH_VLLM_STANDALONE_GLOO_SOCKET_IFNAME="eth0"
-export LLMDBENCH_VLLM_STANDALONE_NCCL_SOCKET_IFNAME="eth0"
-export LLMDBENCH_VLLM_STANDALONE_NCCL_IB_HCA="ibp"
-export LLMDBENCH_VLLM_STANDALONE_VLLM_LOGGING_LEVEL="INFO"
-# export LLMDBENCH_VLLM_STANDALONE_HF_HUB_CACHE="/huggingface-cache"
-export LLMDBENCH_VLLM_STANDALONE_HF_HUB_CACHE="/model-cache/models"
+export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
+cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
+- name: VLLM_FUSED_MOE_CHUNK_SIZE
+  value: "1024"
+- name: DP_SIZE_LOCAL
+  value: "2"
+- name: TRITON_LIBCUDA_PATH
+  value: "/usr/lib64"
+- name: VLLM_SKIP_P2P_CHECK
+  value: "1"
+- name: VLLM_RANDOMIZE_DP_DUMMY_INPUTS
+  value: "1"
+- name: VLLM_USE_DEEP_GEMM
+  value: "1"
+- name: VLLM_ALL2ALL_BACKEND
+  value: "deepep_low_latency"
+- name: NVIDIA_GDRCOPY
+  value: "enabled"
+- name: NVSHMEM_DEBUG
+  value: "INFO"
+- name: NVSHMEM_REMOTE_TRANSPORT
+  value: "ibgda"
+- name: NVSHMEM_IB_ENABLE_IBGDA
+  value: "true"
+- name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME
+  value: "eth0"
+- name: GLOO_SOCKET_IFNAME
+  value: "eth0"
+- name: NCCL_SOCKET_IFNAME
+  value: "eth0"
+- name: NCCL_IB_HCA
+  value: "ibp"
+- name: VLLM_LOGGING_LEVEL
+  value: "INFO"
+- name: HF_HUB_CACHE
+  value: "/model-cache/models"
+EOF
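+# Sanity check (optional): the fragment above is plain YAML written to a temp
+# file, so it can be inspected before deployment, e.g.:
+#   cat "$LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML"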

 # export LLMDBENCH_VLLM_MODELSERVICE_MOUNT_MODEL_VOLUME_OVERRIDE=false
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=1
@@ -52,40 +72,45 @@ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=custom
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS=$(mktemp)
 cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
 START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))
-
 source /opt/vllm/bin/activate
-exec vllm serve \
-  /model-cache/models/Qwen/Qwen3-0.6B \
-  --port 8200 \
-  --disable-log-requests \
-  --disable-uvicorn-access-log \
-  --enable-expert-parallel \
-  --data-parallel-hybrid-lb \
-  --tensor-parallel-size \$TP_SIZE \
-  --data-parallel-size \$((LWS_GROUP_SIZE * DP_SIZE_LOCAL)) \
-  --data-parallel-size-local \$DP_SIZE_LOCAL \
-  --data-parallel-address \${LWS_LEADER_ADDRESS} \
-  --data-parallel-rpc-port 5555 \
-  --data-parallel-start-rank \$START_RANK \
-  --trust-remote-code \
-  --kv_transfer_config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+exec vllm serve /model-cache/models/Qwen/Qwen3-0.6B \
+  --port 8200 \
+  --disable-log-requests \
+  --disable-uvicorn-access-log \
+  --enable-expert-parallel \
+  --data-parallel-hybrid-lb \
+  --tensor-parallel-size \$TP_SIZE \
+  --data-parallel-size \$((LWS_GROUP_SIZE * DP_SIZE_LOCAL)) \
+  --data-parallel-size-local \$DP_SIZE_LOCAL \
+  --data-parallel-address \${LWS_LEADER_ADDRESS} \
+  --data-parallel-rpc-port 5555 \
+  --data-parallel-start-rank \$START_RANK \
+  --trust-remote-code \
+  --kv_transfer_config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 EOF
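+# Worked example of the rank math above (assuming DP_SIZE_LOCAL=2, as set in
+# the env-var fragment): the worker with LWS_WORKER_INDEX=1 gets START_RANK=2,
+# and with LWS_GROUP_SIZE=2 the total --data-parallel-size comes out to 4.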
-export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML="LLMDBENCH_VLLM_STANDALONE_VLLM_FUSED_MOE_CHUNK_SIZE,LLMDBENCH_VLLM_STANDALONE_DP_SIZE_LOCAL,LLMDBENCH_VLLM_STANDALONE_TRITON_LIBCUDA_PATH,LLMDBENCH_VLLM_STANDALONE_VLLM_SKIP_P2P_CHECK,LLMDBENCH_VLLM_STANDALONE_VLLM_RANDOMIZE_DP_DUMMY_INPUTS,LLMDBENCH_VLLM_STANDALONE_VLLM_USE_DEEP_GEMM,LLMDBENCH_VLLM_STANDALONE_VLLM_ALL2ALL_BACKEND,LLMDBENCH_VLLM_STANDALONE_NVIDIA_GDRCOPY,LLMDBENCH_VLLM_STANDALONE_NVSHMEM_DEBUG,LLMDBENCH_VLLM_STANDALONE_NVSHMEM_REMOTE_TRANSPORT,LLMDBENCH_VLLM_STANDALONE_NVSHMEM_IB_ENABLE_IBGDA,LLMDBENCH_VLLM_STANDALONE_NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME,LLMDBENCH_VLLM_STANDALONE_GLOO_SOCKET_IFNAME,LLMDBENCH_VLLM_STANDALONE_NCCL_SOCKET_IFNAME,LLMDBENCH_VLLM_STANDALONE_NCCL_IB_HCA,LLMDBENCH_VLLM_STANDALONE_VLLM_LOGGING_LEVEL,LLMDBENCH_VLLM_STANDALONE_HF_HUB_CACHE"
 export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG=$(mktemp)
 cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG}
 workingDir: /code
 imagePullPolicy: Always
-# securityContext:
-#   runAsUser: 0
-#   runAsGroup: 0
-#   capabilities:
-#     add:
-#       - "IPC_LOCK"
-#       - "SYS_RAWIO"
 EOF
+
 export LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE="nvidia.com/gpu"
 export LLMDBENCH_VLLM_COMMON_ACCELERATOR_NR=2

+export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS=$(mktemp)
+cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS}
+- name: dshm
+  mountPath: /dev/shm
+EOF
+
+export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES=$(mktemp)
+cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES}
+- name: dshm
+  emptyDir:
+    medium: Memory
+    sizeLimit: 1Gi
+EOF
+
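+# The dshm volume backs /dev/shm with RAM: NCCL and PyTorch rely on shared
+# memory, and the 64Mi container default is usually too small. The 1Gi
+# sizeLimit is an assumption sized for this small model; raise it for larger
+# deployments.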
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=1
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM=1
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM=1
@@ -95,19 +120,18 @@ cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS
 START_RANK=\$(( \${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL ))

 source /opt/vllm/bin/activate
-exec vllm serve \
-  Qwen/Qwen3-0.6B \
-  --port 8000 \
-  --disable-log-requests \
-  --disable-uvicorn-access-log \
-  --enable-expert-parallel \
-  --data-parallel-hybrid-lb \
-  --tensor-parallel-size \$TP_SIZE \
-  --data-parallel-size \$((LWS_GROUP_SIZE * DP_SIZE_LOCAL)) \
-  --data-parallel-size-local \$DP_SIZE_LOCAL \
-  --data-parallel-address \${LWS_LEADER_ADDRESS} \
-  --data-parallel-rpc-port 5555 \
-  --data-parallel-start-rank \$START_RANK \
-  --trust-remote-code \
-  --kv_transfer_config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+exec vllm serve /model-cache/models/Qwen/Qwen3-0.6B \
+  --port 8000 \
+  --disable-log-requests \
+  --disable-uvicorn-access-log \
+  --enable-expert-parallel \
+  --data-parallel-hybrid-lb \
+  --tensor-parallel-size \$TP_SIZE \
+  --data-parallel-size \$((LWS_GROUP_SIZE * DP_SIZE_LOCAL)) \
+  --data-parallel-size-local \$DP_SIZE_LOCAL \
+  --data-parallel-address \${LWS_LEADER_ADDRESS} \
+  --data-parallel-rpc-port 5555 \
+  --data-parallel-start-rank \$START_RANK \
+  --trust-remote-code \
+  --kv_transfer_config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
 EOF
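+# Note: the prefill command now mirrors the decode command above (both serve
+# the pre-downloaded snapshot under /model-cache/models); only the port
+# differs (8000 for prefill, 8200 for decode).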