Skip to content

Commit a1d0ce3

Browse files
[Standup] Allow per-pod VLLM cli values. (llm-d#710)
A simple example: two decode pods with different `--max-model-len` values ``` export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=4096,,32768 ``` The double comma is used to "protect" VLLM parameters which themselves contain commas, such as `--model-loader-extra-config`. Signed-off-by: maugustosilva <maugusto.silva@gmail.com>
1 parent 311db72 commit a1d0ce3

File tree

7 files changed

+46
-20
lines changed

7 files changed

+46
-20
lines changed

scenarios/examples/spyre.sh

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -30,11 +30,14 @@ export LLMDBENCH_VLLM_COMMON_EXTRA_PVC_NAME=spyre-precompiled-model
3030

3131
#export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=istio
3232

33+
#export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
34+
3335
export LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE=ibm.com/spyre_vf
3436
export LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM=4
3537
export LLMDBENCH_VLLM_COMMON_AFFINITY="ibm.com/spyre.product:IBM_Spyre"
3638
export LLMDBENCH_VLLM_COMMON_MAX_NUM_BATCHED_TOKENS=1024
3739
export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=32768
40+
#export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=4096,,32768
3841
export LLMDBENCH_VLLM_COMMON_MAX_NUM_SEQ=32
3942
export LLMDBENCH_VLLM_COMMON_MAX_NUM_BATCHED_TOKENS=1024
4043
export LLMDBENCH_VLLM_COMMON_CPU_NR=100
@@ -110,6 +113,7 @@ cat << EOF > $LLMDBENCH_VLLM_COMMON_EXTRA_VOLUME_MOUNTS
110113
- name: preprocesses
111114
mountPath: /setup/preprocess
112115
EOF
116+
113117
export LLMDBENCH_VLLM_COMMON_EXTRA_VOLUMES=$(mktemp)
114118
cat << EOF > $LLMDBENCH_VLLM_COMMON_EXTRA_VOLUMES
115119
- name: spyre-precompiled-model
@@ -134,7 +138,7 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0
134138
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=$LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE
135139

136140
# Decode parameters: 2 decode pods
137-
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=1
141+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
138142
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=${LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM}
139143
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE=$LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE
140144
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=$LLMDBENCH_VLLM_COMMON_CPU_NR
@@ -151,13 +155,13 @@ cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
151155
REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_PREPROCESS; \
152156
/home/senuser/container-scripts/simple_vllm_serve.sh /model-cache/models/REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
153157
--served-model-name REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL \
154-
--port REPLACE_ENV_LLMDBENCH_VLLM_COMMON_METRICS_PORT \
155-
--max-model-len REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN \
156-
--tensor-parallel-size REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM \
157-
--max-num-seqs REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_NUM_SEQ \
158+
--port \$VLLM_METRICS_PORT \
159+
--max-model-len \$VLLM_MAX_MODEL_LEN \
160+
--tensor-parallel-size \$VLLM_TENSOR_PARALLELISM \
161+
--max-num-seq \$VLLM_MAX_NUM_SEQ \
158162
--enable-auto-tool-choice \
159163
--tool-call-parser granite \
160-
--max-num-batched-tokens REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_NUM_BATCHED_TOKENS \
164+
--max-num-batched-tokens \$VLLM_MAX_NUM_BATCHED_TOKENS \
161165
--enable-prefix-caching
162166
EOF
163167

scenarios/guides/inference-scheduling.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
3636
#export LLMDBENCH_VLLM_MODELSERVICE_GATEWAY_CLASS_NAME=data-science-gateway-class
3737
#export LLMDBENCH_VLLM_MODELSERVICE_INFERENCEPOOL_API=inference.networking.x-k8s.io/v1alpha2
3838

39+
#export LLMDBENCH_VLLM_MODELSERVICE_MULTINODE=true
3940

4041
# Routing configuration (via modelservice)
4142
export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")
@@ -127,7 +128,7 @@ EOF
127128
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0
128129

129130
# Decode parameters: 2 decode pods
130-
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=1
131+
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
131132
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=$LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM
132133
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=$LLMDBENCH_VLLM_COMMON_CPU_NR
133134
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=$LLMDBENCH_VLLM_COMMON_CPU_MEM

setup/env.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,7 @@ export LLMDBENCH_VLLM_COMMON_PREPROCESS=${LLMDBENCH_VLLM_COMMON_PREPROCESS:-/bin
176176

177177
# Standalone-specific parameters
178178
export LLMDBENCH_VLLM_COMMON_MODEL_LOADER_EXTRA_CONFIG=${LLMDBENCH_VLLM_COMMON_MODEL_LOADER_EXTRA_CONFIG:-"{}"}
179+
export LLMDBENCH_VLLM_STANDALONE_INFERENCE_PORT=${LLMDBENCH_VLLM_STANDALONE_INFERENCE_PORT:-${LLMDBENCH_VLLM_COMMON_INFERENCE_PORT}}
179180
export LLMDBENCH_VLLM_STANDALONE_PVC_MOUNTPOINT=${LLMDBENCH_VLLM_STANDALONE_PVC_MOUNTPOINT:-/model-storage}
180181
export LLMDBENCH_VLLM_STANDALONE_PREPROCESS=${LLMDBENCH_VLLM_COMMON_PREPROCESS}
181182
export LLMDBENCH_VLLM_STANDALONE_ROUTE=${LLMDBENCH_VLLM_STANDALONE_ROUTE:-1}

setup/functions.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1936,12 +1936,18 @@ def get_model_name_from_pod(api: pykube.HTTPClient,
19361936
curl_command = f"curl -k --no-progress-meter {ip}"
19371937
full_command = ["/bin/bash", "-c", f"{curl_command}"]
19381938

1939+
pull_secret_ref = None
1940+
if ev["vllm_common_pull_secret"] :
1941+
pull_secret_ref = client.V1LocalObjectReference(name=ev["vllm_common_pull_secret"])
1942+
19391943
while current_attempts <= total_attempts :
19401944
pod_name = f"testinference-pod-{get_rand_string()}"
1945+
19411946
pod_manifest = client.V1Pod(
19421947
metadata=client.V1ObjectMeta(name=pod_name, namespace=ev['vllm_common_namespace'], labels={"llm-d.ai/id": f"{pod_name}"}),
19431948
spec=client.V1PodSpec(
19441949
restart_policy="Never",
1950+
image_pull_secrets=[pull_secret_ref],
19451951
containers=[
19461952
client.V1Container(name="model", image=image, command=full_command)
19471953
],
@@ -2579,7 +2585,7 @@ def get_validation_param(ev: dict, type: str = COMMON) -> ValidationParam:
25792585
user_accelerator_nr, tp_size, dp_size
25802586
),
25812587
gpu_memory_util=float(ev[f"{prefix}_accelerator_mem_util"]),
2582-
max_model_len=int(ev["vllm_common_max_model_len"]),
2588+
max_model_len=int(ev["vllm_common_max_model_len"].split(',,')[0]),
25832589
)
25842590

25852591
return validation_param

setup/preprocess/set_llmdbench_environment.py

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -405,9 +405,9 @@
405405
if is_infiniband :
406406
env_file_contents.append(f"export NVSHMEM_IB_ENABLE_IBGDA=\"{is_infiniband}\"")
407407

408-
lwswi = os.getenv("LWS_WORKER_INDEX", "0")
409-
dpsi = os.getenv("DP_SIZE_LOCAL", "0")
410-
sr = int(lwswi) * int(dpsi)
408+
lwswi = int(os.getenv("LWS_WORKER_INDEX", "0"))
409+
dpsi = int(os.getenv("DP_SIZE_LOCAL", "0"))
410+
sr = lwswi * dpsi
411411
env_file_contents.append(f"export START_RANK=\"{sr}\"")
412412

413413
env_file_contents.append("if [[ -z $LWS_WORKER_INDEX ]]; then")
@@ -438,6 +438,21 @@
438438
env_file_contents.append("fi")
439439

440440
env_file_contents.append("echo")
441+
442+
pod_name = os.uname()[1]
443+
if pod_name.count("decode") :
444+
pod_index=eval(pod_name.split('decode-')[-1].replace('-','+'))
445+
if pod_name.count("prefill") :
446+
pod_index=eval(pod_name.split('prefill-')[-1].replace('-','+'))
447+
448+
for key in dict(os.environ).keys():
449+
if "VLLM_" in key:
450+
value = os.environ.get(key)
451+
if value.count(',,') :
452+
newvalue = value.split(',,')[pod_index]
453+
print(f"INFO: Variable \"{key}\" with value \"{value}\" will be re-exported with \"{newvalue}\" ({pod_index})")
454+
env_file_contents.append(f"export {key}={newvalue}")
455+
441456
env_file_contents.append("echo \"Defined NCCL environment variables\"")
442457
env_file_contents.append("env | grep -E \"^NCCL|^UCX|^CUDA|^OMP|^NPROC|^SMOKETEST|^NVSHMEM|START|WORLD_SIZE|RANK|^MASTER\" | sort")
443458
env_file_contents.append("echo")

setup/steps/06_deploy_vllm_standalone_models.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -256,8 +256,8 @@ def generate_deployment_yaml(ev, model, model_label):
256256
httpGet:
257257
path: {ev["vllm_standalone_startup_probe_path"]}
258258
port: {ev['vllm_common_inference_port']}
259-
failureThreshold: {ev["vllm_standalone_startup_probe_failure_threshold"]}
260-
initialDelaySeconds: {ev["vllm_standalone_startup_probe_initial_delay"]}
259+
failureThreshold: {ev["vllm_standalone_startup_probe_failure_threshold"]}
260+
initialDelaySeconds: {ev["vllm_standalone_startup_probe_initial_delay"]}
261261
periodSeconds: 30
262262
timeoutSeconds: 5
263263
livenessProbe:
@@ -309,10 +309,10 @@ def generate_deployment_yaml(ev, model, model_label):
309309
- containerPort: {ev['vllm_standalone_launcher_port']}
310310
startupProbe:
311311
httpGet:
312-
path: /health
313-
port: {ev['vllm_standalone_launcher_port']}
314-
failureThreshold: 200
315-
initialDelaySeconds: {ev.get('vllm_common_initial_delay_probe', 60)}
312+
path: {ev["vllm_standalone_startup_probe_path"]}
313+
port: {ev["vllm_standalone_inference_port"]}
314+
failureThreshold: {ev["vllm_standalone_startup_probe_failure_threshold"]}
315+
initialDelaySeconds: {ev["vllm_standalone_startup_probe_initial_delay"]}
316316
periodSeconds: 30
317317
timeoutSeconds: 5
318318
livenessProbe:
@@ -322,8 +322,8 @@ def generate_deployment_yaml(ev, model, model_label):
322322
periodSeconds: 10
323323
readinessProbe:
324324
httpGet:
325-
path: /health
326-
port: {ev['vllm_standalone_launcher_port']}
325+
path: {ev["vllm_common_readiness_probe_path"]}
326+
port: {ev["vllm_common_inference_port"]}
327327
failureThreshold: 3
328328
periodSeconds: 5
329329
resources:

setup/steps/10_smoketest.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88
import pykube
99
import ipaddress
1010

11-
1211
# Add project root to path for imports
1312
current_file = Path(__file__).resolve()
1413
project_root = current_file.parents[1]

0 commit comments

Comments (0)