Skip to content

Commit a73fc25

Browse files
Add simple test for OCI on multi-node/multi-gpu runtime (#356)
* Add simple test for OCI on multi-node/multi-gpu runtime The new test ensures that OCI support in multi-node/multi-gpu is working properly. This is done by deploying a basic InferenceService with an OCI image in modelcar format. Since storage is, at the moment, the only differing variable, we trust that the tests in test_nvidia_multi_node.py provide proper coverage for the rest of the features. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent db67d52 commit a73fc25

File tree

5 files changed

+93
-4
lines changed

5 files changed

+93
-4
lines changed

tests/model_serving/model_server/multi_node/conftest.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from tests.model_serving.model_server.multi_node.utils import (
1818
delete_multi_node_pod_by_role,
1919
)
20-
from utilities.constants import KServeDeploymentType, Labels, Protocols, Timeout
20+
from utilities.constants import KServeDeploymentType, Labels, Protocols, Timeout, ModelCarImage
2121
from utilities.general import download_model_data
2222
from utilities.inference_utils import create_isvc
2323
from utilities.infra import (
@@ -110,6 +110,58 @@ def multi_node_inference_service(
110110
yield isvc
111111

112112

113+
@pytest.fixture(scope="class")
114+
def multi_node_oci_inference_service(
115+
request: FixtureRequest,
116+
unprivileged_client: DynamicClient,
117+
multi_node_serving_runtime: ServingRuntime,
118+
) -> Generator[InferenceService, Any, Any]:
119+
resources = {
120+
"requests": {
121+
"cpu": "1",
122+
"memory": "4G",
123+
},
124+
"limits": {
125+
"cpu": "2",
126+
"memory": "12G",
127+
},
128+
}
129+
130+
worker_resources = {
131+
"containers": [
132+
{
133+
"name": "worker-container",
134+
"resources": resources,
135+
}
136+
]
137+
}
138+
139+
# NOTE: In KServe v0.15, the autoscaler_mode needs to be updated to "none".
140+
with create_isvc(
141+
client=unprivileged_client,
142+
name=request.param["name"],
143+
namespace=multi_node_serving_runtime.namespace,
144+
runtime=multi_node_serving_runtime.name,
145+
storage_uri=ModelCarImage.GRANITE_8B_CODE_INSTRUCT,
146+
model_format=multi_node_serving_runtime.instance.spec.supportedModelFormats[0].name,
147+
deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
148+
autoscaler_mode="external",
149+
resources=resources,
150+
multi_node_worker_spec=worker_resources,
151+
wait_for_predictor_pods=False,
152+
external_route=True,
153+
timeout=Timeout.TIMEOUT_30MIN,
154+
) as isvc:
155+
wait_for_inference_deployment_replicas(
156+
client=unprivileged_client,
157+
isvc=isvc,
158+
expected_num_deployments=2,
159+
runtime_name=multi_node_serving_runtime.name,
160+
timeout=Timeout.TIMEOUT_15MIN,
161+
)
162+
yield isvc
163+
164+
113165
@pytest.fixture(scope="class")
114166
def multi_node_predictor_pods_scope_class(
115167
unprivileged_client: DynamicClient,
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import pytest
2+
from simple_logger.logger import get_logger
3+
4+
from tests.model_serving.model_server.utils import verify_inference_response
5+
from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG
6+
from utilities.constants import Protocols
7+
8+
pytestmark = [
9+
pytest.mark.rawdeployment,
10+
pytest.mark.usefixtures("skip_if_no_gpu_nodes"),
11+
]
12+
13+
LOGGER = get_logger(name=__name__)
14+
15+
16+
@pytest.mark.parametrize(
17+
"unprivileged_model_namespace, multi_node_oci_inference_service",
18+
[
19+
pytest.param(
20+
{"name": "gpu-oci-multi-node"},
21+
{"name": "multi-oci-vllm"},
22+
)
23+
],
24+
indirect=True,
25+
)
26+
class TestOciMultiNode:
27+
def test_oci_multi_node_basic_external_inference(self, multi_node_oci_inference_service):
28+
"""Test multi node basic inference"""
29+
verify_inference_response(
30+
inference_service=multi_node_oci_inference_service,
31+
inference_config=VLLM_INFERENCE_CONFIG,
32+
inference_type="completions",
33+
protocol=Protocols.HTTPS,
34+
use_default_query=True,
35+
)

utilities/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ class Timeout:
211211
TIMEOUT_10MIN: int = 10 * TIMEOUT_1MIN
212212
TIMEOUT_15MIN: int = 15 * TIMEOUT_1MIN
213213
TIMEOUT_20MIN: int = 20 * TIMEOUT_1MIN
214+
TIMEOUT_30MIN: int = 30 * TIMEOUT_1MIN
214215

215216

216217
class OpenshiftRouteTimeout:
@@ -232,6 +233,7 @@ class ModelCarImage:
232233
MNIST_8_1: str = (
233234
"oci://quay.io/mwaykole/test@sha256:8a3217bcfa2cc5fa3d07496cff8b234acdf2c9725dd307dc0a80401f55e1a11c" # noqa: E501
234235
)
236+
GRANITE_8B_CODE_INSTRUCT: str = "oci://registry.redhat.io/rhelai1/modelcar-granite-8b-code-instruct:1.4"
235237

236238

237239
class MinIo:

utilities/inference_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ def create_isvc(
568568
model_version: str | None = None,
569569
wait_for_predictor_pods: bool = True,
570570
autoscaler_mode: str | None = None,
571-
multi_node_worker_spec: dict[str, int] | None = None,
571+
multi_node_worker_spec: dict[str, Any] | None = None,
572572
timeout: int = Timeout.TIMEOUT_15MIN,
573573
scale_metric: str | None = None,
574574
scale_target: int | None = None,
@@ -603,7 +603,7 @@ def create_isvc(
603603
model_version (str): Model version
604604
wait_for_predictor_pods (bool): Wait for predictor pods
605605
autoscaler_mode (str): Autoscaler mode
606-
multi_node_worker_spec (dict[str, int]): Multi node worker spec
606+
multi_node_worker_spec (dict[str, Any]): Multi node worker spec
607607
timeout (int): Time to wait for the model inference,deployment to be ready
608608
scale_metric (str): Scale metric
609609
scale_target (int): Scale target

utilities/manifests/vllm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
VLLM_INFERENCE_CONFIG = {
22
"default_query_model": {
33
"query_input": '"prompt": "At what temperature does Nitrogen boil?", "max_tokens": 100, "temperature": 0',
4-
"query_output": r'{"id":"cmpl-[a-z0-9]+","object":"text_completion","created":\d+,"model":"$model_name","choices":\[{"index":0,"text":".*Theboilingpointofnitrogenis77.4K.*","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}\],"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null}}',
4+
"query_output": r'{"id":"cmpl-[a-z0-9]+","object":"text_completion","created":\d+,"model":"$model_name","choices":\[{"index":0,"text":".*Theboilingpointofnitrogenis77.4.*","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}\],"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null}}',
55
"use_regex": True
66
},
77
"completions": {

0 commit comments

Comments (0)