@@ -30,11 +30,14 @@ export LLMDBENCH_VLLM_COMMON_EXTRA_PVC_NAME=spyre-precompiled-model
3030
3131# export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=istio
3232
33+ # export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
34+
3335export LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE=ibm.com/spyre_vf
3436export LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM=4
3537export LLMDBENCH_VLLM_COMMON_AFFINITY=" ibm.com/spyre.product:IBM_Spyre"
3638export LLMDBENCH_VLLM_COMMON_MAX_NUM_BATCHED_TOKENS=1024
3739export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=32768
40+ # export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=4096,,32768
3841export LLMDBENCH_VLLM_COMMON_MAX_NUM_SEQ=32
3942export LLMDBENCH_VLLM_COMMON_MAX_NUM_BATCHED_TOKENS=1024
4043export LLMDBENCH_VLLM_COMMON_CPU_NR=100
@@ -110,6 +113,7 @@ cat << EOF > $LLMDBENCH_VLLM_COMMON_EXTRA_VOLUME_MOUNTS
110113- name: preprocesses
111114 mountPath: /setup/preprocess
112115EOF
116+
113117export LLMDBENCH_VLLM_COMMON_EXTRA_VOLUMES=$( mktemp)
114118cat << EOF > $LLMDBENCH_VLLM_COMMON_EXTRA_VOLUMES
115119- name: spyre-precompiled-model
@@ -134,7 +138,7 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0
134138export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=$LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE
135139
136140# Decode parameters: 2 decode pods
137- export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=1
141+ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
138142export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=${LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM}
139143export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE=$LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE
140144export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=$LLMDBENCH_VLLM_COMMON_CPU_NR
@@ -151,13 +155,13 @@ cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
151155REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS; \
152156/home/senuser/container-scripts/simple_vllm_serve.sh /model-cache/models/REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
153157--served-model-name REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
154- --port REPLACE_ENV_LLMDBENCH_VLLM_COMMON_METRICS_PORT \
155- --max-model-len REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN \
156- --tensor-parallel-size REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM \
157- --max-num-seqs REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_NUM_SEQ \
158+ --port \$ VLLM_METRICS_PORT \
159+ --max-model-len \$ VLLM_MAX_MODEL_LEN \
160+ --tensor-parallel-size \$ VLLM_TENSOR_PARALLELISM \
161+ --max-num-seq \$ VLLM_MAX_NUM_SEQ \
158162--enable-auto-tool-choice \
159163--tool-call-parser granite \
160- --max-num-batched-tokens REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_NUM_BATCHED_TOKENS \
164+ --max-num-batched-tokens \$ VLLM_MAX_NUM_BATCHED_TOKENS \
161165--enable-prefix-caching
162166EOF
163167
0 commit comments