Commit 086c1b4

use oci for multi node test

Signed-off-by: Milind waykole <mwaykole@redhat.com>

1 parent 727e0f5 · commit 086c1b4

2 files changed: +44 −67 lines changed


tests/model_serving/model_server/kserve/multi_node/conftest.py

Lines changed: 27 additions & 39 deletions
@@ -9,7 +9,6 @@
 from ocp_resources.inference_service import InferenceService
 from ocp_resources.namespace import Namespace
 from ocp_resources.node import Node
-from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
 from ocp_resources.pod import Pod
 from ocp_resources.resource import ResourceEditor
 from ocp_resources.secret import Secret
@@ -23,11 +22,9 @@
     get_pods_by_isvc_generation,
 )
 from utilities.constants import KServeDeploymentType, Labels, ModelCarImage, Protocols, Timeout
-from utilities.general import download_model_data
 from utilities.inference_utils import create_isvc
 from utilities.infra import (
     get_pods_by_isvc_label,
-    verify_no_failed_pods,
     wait_for_inference_deployment_replicas,
 )
 from utilities.serving_runtime import ServingRuntimeFromTemplate
@@ -46,31 +43,6 @@ def skip_if_no_gpu_nodes(nvidia_gpu_nodes: list[Node]) -> None:
         pytest.skip("Multi-node tests can only run on a Cluster with at least 2 GPU Worker nodes")


-@pytest.fixture(scope="class")
-def models_bucket_downloaded_model_data(
-    request: FixtureRequest,
-    admin_client: DynamicClient,
-    unprivileged_model_namespace: Namespace,
-    models_s3_bucket_name: str,
-    model_pvc: PersistentVolumeClaim,
-    aws_secret_access_key: str,
-    aws_access_key_id: str,
-    models_s3_bucket_endpoint: str,
-    models_s3_bucket_region: str,
-) -> str:
-    return download_model_data(
-        client=admin_client,
-        aws_access_key_id=aws_access_key_id,
-        aws_secret_access_key=aws_secret_access_key,
-        model_namespace=unprivileged_model_namespace.name,
-        model_pvc_name=model_pvc.name,
-        bucket_name=models_s3_bucket_name,
-        aws_endpoint_url=models_s3_bucket_endpoint,
-        aws_default_region=models_s3_bucket_region,
-        model_path=request.param["model-dir"],
-    )
-
-
 @pytest.fixture(scope="class")
 def multi_node_serving_runtime(
     request: FixtureRequest,
@@ -93,26 +65,47 @@ def multi_node_inference_service(
     request: FixtureRequest,
     unprivileged_client: DynamicClient,
     multi_node_serving_runtime: ServingRuntime,
-    model_pvc: PersistentVolumeClaim,
-    models_bucket_downloaded_model_data: str,
 ) -> Generator[InferenceService, Any, Any]:
+    resources = {
+        "requests": {
+            "cpu": "1",
+            "memory": "4G",
+        },
+        "limits": {
+            "cpu": "2",
+            "memory": "12G",
+        },
+    }
+
+    worker_resources = {
+        "containers": [
+            {
+                "name": "worker-container",
+                "resources": resources,
+            }
+        ]
+    }
+
     with create_isvc(
         client=unprivileged_client,
         name=request.param["name"],
         namespace=multi_node_serving_runtime.namespace,
         runtime=multi_node_serving_runtime.name,
-        storage_uri=f"pvc://{model_pvc.name}/{models_bucket_downloaded_model_data}",
+        storage_uri=ModelCarImage.GRANITE_8B_CODE_INSTRUCT,
        model_format=multi_node_serving_runtime.instance.spec.supportedModelFormats[0].name,
         deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
         autoscaler_mode="none",
-        multi_node_worker_spec={},
+        resources=resources,
+        multi_node_worker_spec=worker_resources,
         wait_for_predictor_pods=False,
+        timeout=Timeout.TIMEOUT_30MIN,
     ) as isvc:
         wait_for_inference_deployment_replicas(
             client=unprivileged_client,
             isvc=isvc,
             expected_num_deployments=2,
             runtime_name=multi_node_serving_runtime.name,
+            timeout=Timeout.TIMEOUT_15MIN,
         )
         yield isvc
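
For orientation, a rough sketch of the predictor spec the updated fixture is expected to produce: the PVC-backed storage_uri is replaced by an OCI model-car reference, the head container gets explicit resources, and the worker containers receive the same resources through multi_node_worker_spec. The oci:// reference, runtime name, and model format below are placeholders, not the values resolved from ModelCarImage.GRANITE_8B_CODE_INSTRUCT or the ServingRuntime; the actual rendering is done by utilities.inference_utils.create_isvc.

# Illustrative sketch only (KServe v1beta1 field names); placeholder values are
# assumptions, not taken from the repository.
predictor_resources = {
    "requests": {"cpu": "1", "memory": "4G"},
    "limits": {"cpu": "2", "memory": "12G"},
}

predictor_spec = {
    "model": {
        "modelFormat": {"name": "vLLM"},  # placeholder; read from the runtime's supportedModelFormats
        "runtime": "multi-node-vllm-runtime",  # placeholder ServingRuntime name
        # OCI model car: the model ships inside a container image, so no PVC,
        # S3 download, or NFS storage class is needed.
        "storageUri": "oci://<registry>/<org>/granite-8b-code-instruct-modelcar:<tag>",  # placeholder
        "resources": predictor_resources,
    },
    "workerSpec": {
        "containers": [
            {"name": "worker-container", "resources": predictor_resources},
        ],
    },
}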

@@ -269,16 +262,11 @@ def deleted_multi_node_pod(
         role=request.param["pod-role"],
     )

-    verify_no_failed_pods(
-        client=unprivileged_client,
-        isvc=multi_node_inference_service,
-        timeout=Timeout.TIMEOUT_10MIN,
-    )
-
     wait_for_inference_deployment_replicas(
         client=unprivileged_client,
         isvc=multi_node_inference_service,
         expected_num_deployments=2,
+        timeout=Timeout.TIMEOUT_15MIN,
     )

     _warmup_inference_and_wait_for_recovery(
@@ -317,7 +305,7 @@ def _warmup_inference_and_wait_for_recovery(
     ]

     for sample in TimeoutSampler(
-        wait_timeout=Timeout.TIMEOUT_10MIN,
+        wait_timeout=Timeout.TIMEOUT_30MIN,
         sleep=30,
         func=_probe_inference_health,
         client=client,
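
As context for the wait_timeout bump in _warmup_inference_and_wait_for_recovery, a minimal sketch of the retry loop around the health probe. It assumes TimeoutSampler comes from the standalone timeout-sampler package and uses a stub probe; the real helper is _probe_inference_health and takes a client argument.

# Sketch of the polling pattern, with the recovery window widened to 30 minutes.
# Assumption: the import path of TimeoutSampler; the repository may expose it elsewhere.
from timeout_sampler import TimeoutSampler

TIMEOUT_30MIN = 30 * 60  # stands in for Timeout.TIMEOUT_30MIN


def probe_inference_health() -> bool:
    """Stub probe; the real helper sends a warm-up inference request."""
    return True


# The sampler calls the probe every `sleep` seconds and yields its result,
# raising if `wait_timeout` elapses before the caller breaks out of the loop.
for healthy in TimeoutSampler(
    wait_timeout=TIMEOUT_30MIN,
    sleep=30,
    func=probe_inference_health,
):
    if healthy:
        break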

tests/model_serving/model_server/kserve/multi_node/test_nvidia_multi_node.py

Lines changed: 17 additions & 28 deletions
@@ -1,5 +1,3 @@
-from typing import Any
-
 import pytest
 import structlog

@@ -14,12 +12,12 @@
     verify_ray_status,
 )
 from tests.model_serving.model_server.utils import verify_inference_response
-from utilities.constants import Labels, Protocols, StorageClassName
+from utilities.constants import Protocols
 from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG

 pytestmark = [
     pytest.mark.rawdeployment,
-    pytest.mark.usefixtures("skip_if_no_gpu_nodes", "skip_if_no_nfs_storage_class"),
+    pytest.mark.usefixtures("skip_if_no_gpu_nodes"),
     pytest.mark.model_server_gpu,
     pytest.mark.multinode,
     pytest.mark.gpu,
@@ -31,16 +29,10 @@


 @pytest.mark.parametrize(
-    "unprivileged_model_namespace, models_bucket_downloaded_model_data, model_pvc, multi_node_inference_service",
+    "unprivileged_model_namespace, multi_node_inference_service",
     [
         pytest.param(
             {"name": "gpu-multi-node"},
-            {"model-dir": "granite-8b-code-base"},
-            {
-                "access-modes": "ReadWriteMany",
-                "storage-class-name": StorageClassName.NFS,
-                "pvc-size": "40Gi",
-            },
             {"name": "multi-vllm"},
         )
     ],
@@ -50,7 +42,7 @@ class TestMultiNode:
     """Validate multi-node GPU inference with Ray-based vLLM serving on KServe.

     Steps:
-    1. Deploy a Granite-8B model on a multi-node vLLM inference service backed by PVC storage.
+    1. Deploy a Granite-8B model on a multi-node vLLM inference service backed by an OCI model image.
     2. Verify Ray cluster health and NVIDIA GPU status across head and worker pods.
     3. Validate default runtime worker spec (tensorParallelSize=1, pipelineParallelSize=2).
     4. Confirm pods are distributed across GPU nodes and TLS certificates are provisioned.
@@ -200,22 +192,19 @@ def test_multi_node_basic_external_inference(self, patched_multi_node_isvc_exter
         indirect=True,
     )
     def test_multi_node_tensor_parallel_size_propagation(self, unprivileged_client, patched_multi_node_spec):
-        """Test multi node tensor parallel size (number of GPUs per pod) propagation to pod config"""
-        isvc_parallel_size = str(patched_multi_node_spec.instance.spec.predictor.workerSpec.tensorParallelSize)
-
-        failed_pods: list[dict[str, Any]] = []
-
-        for pod in get_pods_by_isvc_generation(client=unprivileged_client, isvc=patched_multi_node_spec):
-            pod_resources = pod.instance.spec.containers[0].resources
-            if not (
-                isvc_parallel_size
-                == pod_resources.limits[Labels.Nvidia.NVIDIA_COM_GPU]
-                == pod_resources.requests[Labels.Nvidia.NVIDIA_COM_GPU]
-            ):
-                failed_pods.append({pod.name: pod_resources})
-
-        if failed_pods:
-            pytest.fail(f"Failed pods resources : {failed_pods}, expected tesnor parallel size {isvc_parallel_size}")
+        """Test multi node tensor parallel size propagation to pod count"""
+        worker_spec = patched_multi_node_spec.instance.spec.predictor.workerSpec
+        expected_pod_count = worker_spec.tensorParallelSize * worker_spec.pipelineParallelSize
+
+        pods = get_pods_by_isvc_generation(client=unprivileged_client, isvc=patched_multi_node_spec)
+
+        if len(pods) != expected_pod_count:
+            pytest.fail(
+                f"Expected {expected_pod_count} pods "
+                f"(tensorParallelSize={worker_spec.tensorParallelSize} "
+                f"* pipelineParallelSize={worker_spec.pipelineParallelSize}), "
+                f"but found {len(pods)} pods"
+            )

     @pytest.mark.parametrize(
         "patched_multi_node_spec",
