Skip to content

Commit 6b7d48c

Browse files
committed
more parallelism configuration
Signed-off-by: Michael Kalantar <kalantar@us.ibm.com>
1 parent 07c7e36 commit 6b7d48c

4 files changed

Lines changed: 20 additions & 240 deletions

File tree

scenarios/guides/wide-ep-lws.sh

Lines changed: 0 additions & 234 deletions
This file was deleted.

setup/env.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,9 @@ export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=${LLMDBENCH_VLLM_MODELSERVICE
352352
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS:-deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark}
353353
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_MEM_UTIL=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_MEM_UTIL:-$LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEM_UTIL}
354354
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM:-1}
355+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_LOCAL_PARALLELISM=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_LOCAL_PARALLELISM:-1}
355356
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM:-1}
357+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NUM_WORKERS=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_NUM_WORKERS:-1}
356358
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR:-auto}
357359
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE:-auto}
358360
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=${LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE:-$LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE}
@@ -374,7 +376,9 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=${LLMDBENCH_VLLM_MODELSERVIC
374376
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PODANNOTATIONS=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PODANNOTATIONS:-deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark}
375377
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_MEM_UTIL=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_MEM_UTIL:-$LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEM_UTIL}
376378
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM:-1}
379+
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_LOCAL_PARALLELISM=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_LOCAL_PARALLELISM:-1}
377380
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM:-1}
381+
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NUM_WORKERS=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NUM_WORKERS:-1}
378382
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR:-auto}
379383
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE:-auto}
380384
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE:-$LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE}

setup/steps/09_deploy_via_modelservice.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,9 @@ def generate_ms_values_yaml(
150150
decode_replicas = int(ev.get("vllm_modelservice_decode_replicas", "0"))
151151
decode_create = "true" if decode_replicas > 0 else "false"
152152
decode_data_parallelism = ev.get("vllm_modelservice_decode_data_parallelism", "1")
153-
decode_tensor_parallelism = ev["vllm_modelservice_decode_tensor_parallelism"]
153+
decode_data_local_parallelism = ev.get("vllm_modelservice_decode_data_local_parallelism", "1")
154+
decode_tensor_parallelism = ev.get("vllm_modelservice_decode_tensor_parallelism", "1")
155+
# Worker count for decode pods. The key mirrors the LLMDBENCH_VLLM_MODELSERVICE_DECODE_NUM_WORKERS
# default exported by setup/env.sh (presumably ev keys are the lowercased env names without the
# LLMDBENCH_ prefix, as with the sibling parallelism lookups -- confirm against the ev loader).
decode_workers_parallelism = ev.get("vllm_modelservice_decode_num_workers", "1")
154156
decode_model_command = ev.get("vllm_modelservice_decode_model_command", "")
155157
decode_extra_args = ev.get("vllm_modelservice_decode_extra_args", "")
156158
decode_inference_port = ev["vllm_modelservice_decode_inference_port"]
@@ -159,9 +161,9 @@ def generate_ms_values_yaml(
159161
prefill_replicas = int(ev.get("vllm_modelservice_prefill_replicas", "0"))
160162
prefill_create = "true" if prefill_replicas > 0 else "false"
161163
prefill_data_parallelism = ev.get("vllm_modelservice_prefill_data_parallelism", "1")
162-
prefill_tensor_parallelism = ev.get(
163-
"vllm_modelservice_prefill_tensor_parallelism", "1"
164-
)
164+
prefill_data_local_parallelism = ev.get("vllm_modelservice_prefill_data_local_parallelism", "1")
165+
prefill_tensor_parallelism = ev.get("vllm_modelservice_prefill_tensor_parallelism", "1")
166+
# Worker count for prefill pods. The key mirrors the LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NUM_WORKERS
# default exported by setup/env.sh (presumably ev keys are the lowercased env names without the
# LLMDBENCH_ prefix, as with the sibling parallelism lookups -- confirm against the ev loader).
prefill_workers_parallelism = ev.get("vllm_modelservice_prefill_num_workers", "1")
165167
prefill_model_command = ev.get("vllm_modelservice_prefill_model_command", "")
166168
prefill_extra_args = ev.get("vllm_modelservice_prefill_extra_args", "")
167169
prefill_inference_port = ev["vllm_modelservice_prefill_inference_port"]
@@ -245,7 +247,9 @@ def generate_ms_values_yaml(
245247
{add_affinity(ev)}
246248
parallelism:
247249
data: {decode_data_parallelism}
250+
dataLocal: {decode_data_local_parallelism}
248251
tensor: {decode_tensor_parallelism}
252+
workers: {decode_workers_parallelism}
249253
annotations:
250254
{add_annotations("LLMDBENCH_VLLM_COMMON_ANNOTATIONS").lstrip()}
251255
podAnnotations:
@@ -300,7 +304,9 @@ def generate_ms_values_yaml(
300304
{add_affinity(ev)}
301305
parallelism:
302306
data: {prefill_data_parallelism}
307+
dataLocal: {prefill_data_local_parallelism}
303308
tensor: {prefill_tensor_parallelism}
309+
workers: {prefill_workers_parallelism}
304310
annotations:
305311
{add_annotations("LLMDBENCH_VLLM_COMMON_ANNOTATIONS").lstrip()}
306312
podAnnotations:

workload/report/convert.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,19 +223,23 @@ def _get_llmd_benchmark_envars() -> dict:
223223
"accelerator": [{
224224
"model": os.environ['LLMDBENCH_VLLM_COMMON_AFFINITY'].split(':', 1)[-1],
225225
"count": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM'])
226-
* int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM']),
226+
* int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_LOCAL_PARALLELISM']),
227227
"parallelism": {
228228
"tp": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM']),
229229
"dp": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM']),
230+
"dpLocal": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_LOCAL_PARALLELISM']),
231+
"workers": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NUM_WORKERS']),
230232
},
231233
}] * int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS']) +
232234
[{
233235
"model": os.environ['LLMDBENCH_VLLM_COMMON_AFFINITY'].split(':', 1)[-1],
234236
"count": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM'])
235-
* int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM']),
237+
* int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_LOCAL_PARALLELISM']),
236238
"parallelism": {
237239
"tp": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM']),
238240
"dp": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM']),
241+
"dpLocal": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_LOCAL_PARALLELISM']),
242+
"workers": int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_NUM_WORKERS']),
239243
},
240244
}] * int(os.environ['LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS']),
241245
},

0 commit comments

Comments
 (0)