Skip to content

Commit a73fc25

Browse files
Add simple test for OCI on multi-node/multi-gpu runtime (#356)
* Add simple test for OCI on multi-node/multi-gpu runtime The new test ensures that OCI support in multi-node/multi-gpu is working properly. This is done by deploying a basic InferenceService with an OCI image in modelcar format. Since storage is, at the moment, the only differing variable, we trust that the tests in test_nvidia_multi_node.py provide proper coverage for the rest of the features. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent db67d52 commit a73fc25

File tree

5 files changed

+93
-4
lines changed

5 files changed

+93
-4
lines changed

tests/model_serving/model_server/multi_node/conftest.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from tests.model_serving.model_server.multi_node.utils import (
1818
delete_multi_node_pod_by_role,
1919
)
20-
from utilities.constants import KServeDeploymentType, Labels, Protocols, Timeout
20+
from utilities.constants import KServeDeploymentType, Labels, Protocols, Timeout, ModelCarImage
2121
from utilities.general import download_model_data
2222
from utilities.inference_utils import create_isvc
2323
from utilities.infra import (
@@ -110,6 +110,58 @@ def multi_node_inference_service(
110110
yield isvc
111111

112112

113+
@pytest.fixture(scope="class")
114+
def multi_node_oci_inference_service(
115+
request: FixtureRequest,
116+
unprivileged_client: DynamicClient,
117+
multi_node_serving_runtime: ServingRuntime,
118+
) -> Generator[InferenceService, Any, Any]:
119+
resources = {
120+
"requests": {
121+
"cpu": "1",
122+
"memory": "4G",
123+
},
124+
"limits": {
125+
"cpu": "2",
126+
"memory": "12G",
127+
},
128+
}
129+
130+
worker_resources = {
131+
"containers": [
132+
{
133+
"name": "worker-container",
134+
"resources": resources,
135+
}
136+
]
137+
}
138+
139+
# NOTE: In KServe v0.15, the autoscaler_mode needs to be updated to "none".
140+
with create_isvc(
141+
client=unprivileged_client,
142+
name=request.param["name"],
143+
namespace=multi_node_serving_runtime.namespace,
144+
runtime=multi_node_serving_runtime.name,
145+
storage_uri=ModelCarImage.GRANITE_8B_CODE_INSTRUCT,
146+
model_format=multi_node_serving_runtime.instance.spec.supportedModelFormats[0].name,
147+
deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
148+
autoscaler_mode="external",
149+
resources=resources,
150+
multi_node_worker_spec=worker_resources,
151+
wait_for_predictor_pods=False,
152+
external_route=True,
153+
timeout=Timeout.TIMEOUT_30MIN,
154+
) as isvc:
155+
wait_for_inference_deployment_replicas(
156+
client=unprivileged_client,
157+
isvc=isvc,
158+
expected_num_deployments=2,
159+
runtime_name=multi_node_serving_runtime.name,
160+
timeout=Timeout.TIMEOUT_15MIN,
161+
)
162+
yield isvc
163+
164+
113165
@pytest.fixture(scope="class")
114166
def multi_node_predictor_pods_scope_class(
115167
unprivileged_client: DynamicClient,
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import pytest
2+
from simple_logger.logger import get_logger
3+
4+
from tests.model_serving.model_server.utils import verify_inference_response
5+
from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG
6+
from utilities.constants import Protocols
7+
8+
pytestmark = [
9+
pytest.mark.rawdeployment,
10+
pytest.mark.usefixtures("skip_if_no_gpu_nodes"),
11+
]
12+
13+
LOGGER = get_logger(name=__name__)
14+
15+
16+
@pytest.mark.parametrize(
17+
"unprivileged_model_namespace, multi_node_oci_inference_service",
18+
[
19+
pytest.param(
20+
{"name": "gpu-oci-multi-node"},
21+
{"name": "multi-oci-vllm"},
22+
)
23+
],
24+
indirect=True,
25+
)
26+
class TestOciMultiNode:
27+
def test_oci_multi_node_basic_external_inference(self, multi_node_oci_inference_service):
28+
"""Test multi node basic inference"""
29+
verify_inference_response(
30+
inference_service=multi_node_oci_inference_service,
31+
inference_config=VLLM_INFERENCE_CONFIG,
32+
inference_type="completions",
33+
protocol=Protocols.HTTPS,
34+
use_default_query=True,
35+
)

utilities/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ class Timeout:
211211
TIMEOUT_10MIN: int = 10 * TIMEOUT_1MIN
212212
TIMEOUT_15MIN: int = 15 * TIMEOUT_1MIN
213213
TIMEOUT_20MIN: int = 20 * TIMEOUT_1MIN
214+
TIMEOUT_30MIN: int = 30 * TIMEOUT_1MIN
214215

215216

216217
class OpenshiftRouteTimeout:
@@ -232,6 +233,7 @@ class ModelCarImage:
232233
MNIST_8_1: str = (
233234
"oci://quay.io/mwaykole/test@sha256:8a3217bcfa2cc5fa3d07496cff8b234acdf2c9725dd307dc0a80401f55e1a11c" # noqa: E501
234235
)
236+
GRANITE_8B_CODE_INSTRUCT: str = "oci://registry.redhat.io/rhelai1/modelcar-granite-8b-code-instruct:1.4"
235237

236238

237239
class MinIo:

utilities/inference_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ def create_isvc(
568568
model_version: str | None = None,
569569
wait_for_predictor_pods: bool = True,
570570
autoscaler_mode: str | None = None,
571-
multi_node_worker_spec: dict[str, int] | None = None,
571+
multi_node_worker_spec: dict[str, Any] | None = None,
572572
timeout: int = Timeout.TIMEOUT_15MIN,
573573
scale_metric: str | None = None,
574574
scale_target: int | None = None,
@@ -603,7 +603,7 @@ def create_isvc(
603603
model_version (str): Model version
604604
wait_for_predictor_pods (bool): Wait for predictor pods
605605
autoscaler_mode (str): Autoscaler mode
606-
multi_node_worker_spec (dict[str, int]): Multi node worker spec
606+
multi_node_worker_spec (dict[str, Any]): Multi node worker spec
607607
timeout (int): Time to wait for the model inference,deployment to be ready
608608
scale_metric (str): Scale metric
609609
scale_target (int): Scale target

utilities/manifests/vllm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
VLLM_INFERENCE_CONFIG = {
22
"default_query_model": {
33
"query_input": '"prompt": "At what temperature does Nitrogen boil?", "max_tokens": 100, "temperature": 0',
4-
"query_output": r'{"id":"cmpl-[a-z0-9]+","object":"text_completion","created":\d+,"model":"$model_name","choices":\[{"index":0,"text":".*Theboilingpointofnitrogenis77.4K.*","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}\],"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null}}',
4+
"query_output": r'{"id":"cmpl-[a-z0-9]+","object":"text_completion","created":\d+,"model":"$model_name","choices":\[{"index":0,"text":".*Theboilingpointofnitrogenis77.4.*","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}\],"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null}}',
55
"use_regex": True
66
},
77
"completions": {

0 commit comments

Comments (0)