Skip to content

Commit 896ba74

Browse files
authored
vllm spyre runtime model validation automation (#700)
* feat(model-validation): vllm spyre runtime model validation automation

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(model-validation): add env variables, fix template issues
* feat(model-validation): add env variables, fix template issues
* feat(model-validation): setup testing ports, fix serving runtime

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(model-validation): inference port
* feat(model-validation): add progress deadline for serverless deployment
* feat(model-validation): get model name before sending request

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(model-validation): skip serverless tests for spyre
* feat(model-validation): skip serverless tests for spyre
* feat(model-validation): skip serverless tests for spyre
* feat(model-validation): address PR comments

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(spyre-model-validation): use template on RHOAI

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(spyre-model-validation): use template on RHOAI

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>

---------

Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
1 parent a595065 commit 896ba74

File tree

9 files changed

+62
-13
lines changed

9 files changed

+62
-13
lines changed

tests/conftest.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import base64
2+
import binascii
23
import os
34
import shutil
45
from ast import literal_eval
@@ -157,7 +158,11 @@ def registry_pull_secret(pytestconfig: Config) -> str:
157158
"Registry pull secret is not set. "
158159
"Either pass with `--registry_pull_secret` or set `OCI_REGISTRY_PULL_SECRET` environment variable"
159160
)
160-
return registry_pull_secret
161+
try:
162+
base64.b64decode(s=registry_pull_secret, validate=True)
163+
return registry_pull_secret
164+
except binascii.Error:
165+
raise ValueError("Registry pull secret is not a valid base64 encoded string")
161166

162167

163168
@pytest.fixture(scope="session")

tests/model_serving/model_runtime/model_validation/conftest.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from utilities.constants import KServeDeploymentType, Labels, RuntimeTemplates
3232
from utilities.inference_utils import create_isvc
3333
from utilities.serving_runtime import ServingRuntimeFromTemplate
34+
3435
from simple_logger.logger import get_logger
3536

3637
LOGGER = get_logger(name=__name__)
@@ -45,6 +46,7 @@ def model_car_serving_runtime(
4546
vllm_runtime_image: str,
4647
) -> Generator[ServingRuntime, None, None]:
4748
accelerator_type = supported_accelerator_type.lower()
49+
4850
template_name = TEMPLATE_MAP.get(accelerator_type, RuntimeTemplates.VLLM_CUDA)
4951
LOGGER.info(f"using template: {template_name}")
5052
assert model_namespace.name is not None
@@ -88,6 +90,20 @@ def vllm_model_car_inference_service(
8890
resources["limits"][identifier] = gpu_count
8991
isvc_kwargs["resources"] = resources
9092

93+
if (
94+
identifier == Labels.Spyre.SPYRE_COM_GPU
95+
and deployment_config.get("deployment_type") == KServeDeploymentType.SERVERLESS
96+
):
97+
pytest.skip("Spyre cluster is not setup with TLS/mTLS")
98+
if identifier == Labels.Spyre.SPYRE_COM_GPU:
99+
isvc_kwargs["scheduler_name"] = "spyre-scheduler"
100+
resources["requests"] = {
101+
"ibm.com/spyre_pf": gpu_count,
102+
}
103+
resources["limits"] = {
104+
"ibm.com/spyre_pf": gpu_count,
105+
}
106+
91107
if timeout:
92108
isvc_kwargs["timeout"] = timeout
93109

@@ -131,6 +147,7 @@ def kserve_registry_pull_secret(
131147
"ACCESS_TYPE": PULL_SECRET_ACCESS_TYPE,
132148
"OCI_HOST": registry_host,
133149
},
150+
type="kubernetes.io/dockerconfigjson",
134151
wait_for_resource=True,
135152
) as secret:
136153
yield secret
@@ -257,7 +274,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
257274
if not isinstance(model_car_data, list):
258275
raise ValueError("Invalid format for `model-car` in YAML. Expected a list of objects.")
259276

260-
# Check if metafunc.cls is not None to avoid linter errors
261277
if not metafunc.cls:
262278
return
263279

tests/model_serving/model_runtime/model_validation/constant.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
AcceleratorType.NVIDIA: Labels.Nvidia.NVIDIA_COM_GPU,
88
AcceleratorType.AMD: "amd.com/gpu",
99
AcceleratorType.GAUDI: "habana.ai/gaudi",
10+
AcceleratorType.SPYRE: Labels.Spyre.SPYRE_COM_GPU,
1011
}
1112

1213
TEMPLATE_MAP: dict[str, str] = {
1314
AcceleratorType.NVIDIA: RuntimeTemplates.VLLM_CUDA,
1415
AcceleratorType.AMD: RuntimeTemplates.VLLM_ROCM,
15-
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDUI,
16+
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDI,
17+
AcceleratorType.SPYRE: RuntimeTemplates.VLLM_SPYRE,
1618
}
1719

1820

@@ -71,10 +73,10 @@
7173
],
7274
]
7375

74-
PULL_SECRET_ACCESS_TYPE: str = "WyJQdWxsIl0=" # Base64 encoded value for "Pull"
76+
PULL_SECRET_ACCESS_TYPE: str = '["Pull"]'
7577
PULL_SECRET_NAME: str = "oci-registry-pull-secret"
76-
INFERENCE_SERVICE_PORT: int = 8080
77-
CONTAINER_PORT: int = 8080
78+
SPYRE_INFERENCE_SERVICE_PORT: int = 8000
79+
SPYRE_CONTAINER_PORT: int = 8000
7880
TIMEOUT_20MIN: int = 30 * 60
7981
OPENAI_ENDPOINT_NAME: str = "openai"
8082
TGIS_ENDPOINT_NAME: str = "tgis"

tests/model_serving/model_runtime/model_validation/sample_modelcar_config.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
11
model-car:
2+
- name: granite-3.1-8b-instruct
3+
image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-instruct:1.5
4+
model_output_type: text
5+
serving_arguments:
6+
args:
7+
- "--uvicorn-log-level=info"
8+
- "--max-model-len=2048"
9+
- "--trust-remote-code"
10+
- "--distributed-executor-backend=mp"
11+
gpu_count: 1
12+
213
- name: granite-3.1-8b-base-quantized.w4a16
314
image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-base-quantized-w4a16:1.5
415
model_output_type: text

tests/model_serving/model_runtime/model_validation/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import re
22
from typing import Any
3-
43
from tests.model_serving.model_runtime.vllm.constant import VLLM_SUPPORTED_QUANTIZATION
54

65

tests/model_serving/model_runtime/utils.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
OPENAI_ENDPOINT_NAME,
1111
AUDIO_FILE_URL,
1212
AUDIO_FILE_LOCAL_PATH,
13+
SPYRE_INFERENCE_SERVICE_PORT,
1314
)
1415
from utilities.constants import Ports
1516
from utilities.exceptions import NotSupportedError
@@ -145,13 +146,14 @@ def validate_raw_openai_inference_request(
145146
completion_query: list[dict[str, str]],
146147
model_output_type: str,
147148
model_name: str,
149+
port: int = Ports.REST_PORT,
148150
) -> None:
149151
if model_output_type == "audio":
150152
LOGGER.info("Running audio inference test")
151153
model_info, completion_responses = run_audio_inference(
152154
pod_name=pod_name,
153155
isvc=isvc,
154-
port=Ports.REST_PORT,
156+
port=port,
155157
endpoint=OPENAI_ENDPOINT_NAME,
156158
model_name=model_name,
157159
)
@@ -161,10 +163,13 @@ def validate_raw_openai_inference_request(
161163
return
162164
elif model_output_type == "text":
163165
LOGGER.info("Running text inference test")
166+
scheduler_name = getattr(isvc.instance.spec.predictor, "schedulerName", "") or ""
167+
if scheduler_name.lower() == "spyre-scheduler":
168+
port = SPYRE_INFERENCE_SERVICE_PORT
164169
model_info, completion_responses = run_raw_inference(
165170
pod_name=pod_name,
166171
isvc=isvc,
167-
port=Ports.REST_PORT,
172+
port=port,
168173
endpoint=OPENAI_ENDPOINT_NAME,
169174
completion_query=completion_query,
170175
)
@@ -205,6 +210,8 @@ def fetch_openai_response(
205210
model_name: str,
206211
completion_query: list[dict[str, str]] | None = None,
207212
) -> tuple[Any, list[Any]]:
213+
model_info = OpenAIClient.get_request_http(host=url, endpoint=OpenAIEnpoints.MODELS_INFO)
214+
model_name = model_info[0]["id"] if model_info else model_name
208215
if completion_query is None:
209216
completion_query = COMPLETION_QUERY
210217
completion_responses = []
@@ -216,7 +223,6 @@ def fetch_openai_response(
216223
)
217224
completion_responses.append(completion_response)
218225

219-
model_info = OpenAIClient.get_request_http(host=url, endpoint=OpenAIEnpoints.MODELS_INFO)
220226
return model_info, completion_responses
221227

222228

tests/model_serving/model_runtime/vllm/constant.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
TEMPLATE_MAP: dict[str, str] = {
1616
AcceleratorType.NVIDIA: RuntimeTemplates.VLLM_CUDA,
1717
AcceleratorType.AMD: RuntimeTemplates.VLLM_ROCM,
18-
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDUI,
18+
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDI,
1919
}
2020

2121
PREDICT_RESOURCES: dict[str, Union[list[dict[str, Union[str, dict[str, str]]]], dict[str, dict[str, str]]]] = {

utilities/constants.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ class RuntimeTemplates:
7474
TGIS_GRPC_SERVING: str = "tgis-grpc-serving-template"
7575
VLLM_CUDA: str = "vllm-cuda-runtime-template"
7676
VLLM_ROCM: str = "vllm-rocm-runtime-template"
77-
VLLM_GAUDUI: str = "vllm-gaudi-runtime-template"
77+
VLLM_GAUDI: str = "vllm-gaudi-runtime-template"
78+
VLLM_SPYRE: str = "vllm-spyre-x86-runtime-template"
7879
MLSERVER_GRPC: str = "mlserver-grpc-runtime-template"
7980
MLSERVER_REST: str = "mlserver-rest-runtime-template"
8081
TRITON_REST: str = "triton-rest-runtime-template"
@@ -123,7 +124,8 @@ class AcceleratorType:
123124
NVIDIA: str = "nvidia"
124125
AMD: str = "amd"
125126
GAUDI: str = "gaudi"
126-
SUPPORTED_LISTS: list[str] = [NVIDIA, AMD, GAUDI]
127+
SPYRE: str = "spyre"
128+
SUPPORTED_LISTS: list[str] = [NVIDIA, AMD, GAUDI, SPYRE]
127129

128130

129131
class ApiGroups:
@@ -212,6 +214,9 @@ class Nvidia:
212214
class ROCm:
213215
ROCM_GPU: str = "amd.com/gpu"
214216

217+
class Spyre:
218+
SPYRE_COM_GPU: str = "ibm.com/spyre_pf"
219+
215220
class Kueue:
216221
MANAGED: str = "kueue.openshift.io/managed"
217222
QUEUE_NAME: str = "kueue.x-k8s.io/queue-name"

utilities/inference_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ def create_isvc(
583583
protocol_version: str | None = None,
584584
labels: dict[str, str] | None = None,
585585
auto_scaling: dict[str, Any] | None = None,
586+
scheduler_name: str | None = None,
586587
) -> Generator[InferenceService, Any, Any]:
587588
"""
588589
Create InferenceService object.
@@ -618,6 +619,7 @@ def create_isvc(
618619
teardown (bool): Teardown
619620
protocol_version (str): Protocol version of the model server
620621
auto_scaling (dict[str, Any]): Auto scaling configuration for the model
622+
scheduler_name (str): Scheduler name
621623
622624
Yields:
623625
InferenceService: InferenceService object
@@ -712,6 +714,9 @@ def create_isvc(
712714
if protocol_version is not None:
713715
predictor_dict["model"]["protocolVersion"] = protocol_version
714716

717+
if scheduler_name is not None:
718+
predictor_dict["schedulerName"] = scheduler_name
719+
715720
with InferenceService(
716721
client=client,
717722
name=name,

0 commit comments

Comments (0)