@@ -291,7 +291,7 @@ def generate_ms_values_yaml(
291291 readinessProbe:
292292 httpGet:
293293 path: /health
294- port: { decode_inference_port }
294+ port: { common_inference_port }
295295 failureThreshold: 3
296296 periodSeconds: 5
297297 { add_config (decode_extra_container_config , 6 ).lstrip ()}
@@ -351,7 +351,7 @@ def generate_ms_values_yaml(
351351 readinessProbe:
352352 httpGet:
353353 path: /health
354- port: { prefill_inference_port }
354+ port: { common_inference_port }
355355 failureThreshold: 3
356356 periodSeconds: 5
357357 { add_config (prefill_extra_container_config , 6 ).lstrip ()}
@@ -561,15 +561,21 @@ def main():
561561
562562 # Wait for decode pods to be created, running, and ready
563563 api_client = client .CoreV1Api ()
564+ expected_num_decode_pods = ev ["vllm_modelservice_decode_replicas" ]
565+ if ev .get ("vllm_modelservice_multinode" , "false" ).lower () == "true" :
566+ expected_num_decode_pods = int (ev .get ("vllm_modelservice_decode_num_workers_parallelism" , "1" )) * int (expected_num_decode_pods )
564567 result = wait_for_pods_created_running_ready (
565- api_client , ev , ev [ "vllm_modelservice_decode_replicas" ] , "decode"
568+ api_client , ev , expected_num_decode_pods , "decode"
566569 )
567570 if result != 0 :
568571 return result
569572
570573 # Wait for prefill pods to be created, running, and ready
574+ expected_num_prefill_pods = ev ["vllm_modelservice_prefill_replicas" ]
575+ if ev .get ("vllm_modelservice_multinode" , "false" ).lower () == "true" :
576+ expected_num_prefill_pods = int (ev .get ("vllm_modelservice_prefill_num_workers_parallelism" , "1" )) * int (expected_num_prefill_pods )
571577 result = wait_for_pods_created_running_ready (
572- api_client , ev , ev [ "vllm_modelservice_prefill_replicas" ] , "prefill"
578+ api_client , ev , expected_num_prefill_pods , "prefill"
573579 )
574580 if result != 0 :
575581 return result
0 commit comments