spyre fixes

kalantar · kalantar · commit 57516ac35b83 · 2025-11-25T13:44:44.000-05:00
Signed-off-by: Michael Kalantar <kalantar@us.ibm.com>
diff --git a/scenarios/examples/spyre.sh b/scenarios/examples/spyre.sh
@@ -90,8 +90,6 @@ cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
   value: '1024,256'
 - name: DTCOMPILER_KEEP_EXPORT
   value: 'true'
-- name: TENSOR_PARALLEL_SIZE
-  value: "REPLACE_ENV_LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM"
 - name: PORT
   value: "REPLACE_ENV_LLMDBENCH_VLLM_COMMON_INFERENCE_PORT"
 - name: DTCOMPILER_KEEP_EXPORT
@@ -117,7 +115,7 @@ EOF
 # Prefill parameters: 0 prefill pod
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_RESOURCE=ibm.com/spyre_pf
-export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=0
+# export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=0
 
 # Decode parameters: 2 decode pods
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
@@ -136,14 +134,14 @@ cat << EOF > $LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS
 /home/senuser/container-scripts/simple_vllm_serve.sh REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL;  \
 --port REPLACE_ENV_LLMDBENCH_VLLM_COMMON_INFERENCE_PORT \
 --max-model-len REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN \
---tensor-parallel-size REPLACE_ENV_LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM \
+--tensor-parallel-size \$TP_SIZE \
 --max-num-seqs 32 \
 --enable-auto-tool-choice \
 --tool-call-parser granite
 EOF
 
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_RESOURCE=ibm.com/spyre_pf
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=1
+# export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=1
 
 # Workload parameters
 
diff --git a/setup/functions.py b/setup/functions.py
@@ -1238,20 +1238,6 @@ def add_resources(ev:dict, identifier: str) -> [str, str]:
         identifier = f"modelservice_{identifier}"
         section_indent = " " * 8
 
-    accelerator_resource = ev[f"vllm_{identifier}_accelerator_resource"]
-
-    if accelerator_resource == "auto":
-        accelerator_resource = "nvidia.com/gpu"
-
-    accelerator_nr = ev[f"vllm_{identifier}_accelerator_nr"]
-
-    data_parallelism = ev[f"vllm_{identifier}_data_parallelism"]
-    tensor_parallelism = ev[f"vllm_{identifier}_tensor_parallelism"]
-
-    accelerator_count = get_accelerator_nr(
-        accelerator_nr, tensor_parallelism, data_parallelism
-    )
-
     cpu_mem = ev[f"vllm_{identifier}_cpu_mem"]
     cpu_nr = ev[f"vllm_{identifier}_cpu_nr"]
 
@@ -1278,26 +1264,6 @@ def add_resources(ev:dict, identifier: str) -> [str, str]:
             f'{section_indent}{ephemeral_storage_resource}: "{ephemeral_storage_nr}"'
         )
 
-    if (
-        accelerator_resource
-        and accelerator_count
-        and str(accelerator_count) != "0"
-    ):
-        limits_resources.append(
-            f'{section_indent}{accelerator_resource}: "{accelerator_count}"'
-        )
-        requests_resources.append(
-            f'{section_indent}{accelerator_resource}: "{accelerator_count}"'
-        )
-
-    if accelerator_resource != "nvidia.com/gpu" :
-        limits_resources.append(
-            f'{section_indent}nvidia.com/gpu: "0"'
-        )
-        requests_resources.append(
-            f'{section_indent}nvidia.com/gpu: "0"'
-        )
-
     if network_resource and network_nr:
         limits_resources.append(
             f'{section_indent}{network_resource}: "{network_nr}"'
diff --git a/setup/steps/09_deploy_via_modelservice.py b/setup/steps/09_deploy_via_modelservice.py
@@ -25,7 +25,6 @@
     get_image,
     add_command,
     add_command_line_options,
-    get_accelerator_nr,
     add_annotations,
     add_additional_env_to_yaml,
     add_config,
@@ -236,10 +235,11 @@ def generate_ms_values_yaml(
     connector: {proxy_connector}
     debugLevel: {proxy_debug_level}
 
+{add_affinity(ev, "")}
+
 decode:
   create: {decode_create}
   replicas: {decode_replicas}
-{add_affinity(ev, "  ")}
   parallelism:
     data: {decode_data_parallelism}
     tensor: {decode_tensor_parallelism}
@@ -294,7 +294,6 @@ def generate_ms_values_yaml(
 prefill:
   create: {prefill_create}
   replicas: {prefill_replicas}
-{add_affinity(ev, "  ")}
   parallelism:
     data: {prefill_data_parallelism}
     tensor: {prefill_tensor_parallelism}