
Commit 86aedad

use oci for multi node test
Signed-off-by: Milind waykole <mwaykole@redhat.com>
1 parent: 654f1b6

5 files changed: +84 additions, -113 deletions


tests/model_serving/model_server/kserve/multi_node/conftest.py

Lines changed: 24 additions & 61 deletions
@@ -6,19 +6,19 @@
 from ocp_resources.inference_service import InferenceService
 from ocp_resources.namespace import Namespace
 from ocp_resources.node import Node
-from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
 from ocp_resources.pod import Pod
 from ocp_resources.resource import ResourceEditor
 from ocp_resources.secret import Secret
 from ocp_resources.serving_runtime import ServingRuntime
 from pytest_testconfig import config as py_config
-from timeout_sampler import TimeoutSampler
 
 from tests.model_serving.model_server.kserve.multi_node.utils import (
     delete_multi_node_pod_by_role,
+    wait_for_vllm_health,
 )
+from timeout_sampler import TimeoutSampler
+
 from utilities.constants import KServeDeploymentType, Labels, Protocols, Timeout, ModelCarImage
-from utilities.general import download_model_data
 from utilities.inference_utils import create_isvc
 from utilities.infra import (
     get_pods_by_isvc_label,
@@ -33,37 +33,20 @@ def nvidia_gpu_nodes(nodes: list[Node]) -> list[Node]:
     return [node for node in nodes if "nvidia.com/gpu.present" in node.labels.keys()]
 
 
+@pytest.fixture(scope="session")
+def max_gpu_per_node(nvidia_gpu_nodes: list[Node]) -> int:
+    return max(
+        (int(node.instance.status.allocatable.get("nvidia.com/gpu", 0)) for node in nvidia_gpu_nodes),
+        default=0,
+    )
+
+
 @pytest.fixture(scope="session")
 def skip_if_no_gpu_nodes(nvidia_gpu_nodes: list[Node]) -> None:
     if len(nvidia_gpu_nodes) < 2:
         pytest.skip("Multi-node tests can only run on a Cluster with at least 2 GPU Worker nodes")
 
 
-@pytest.fixture(scope="class")
-def models_bucket_downloaded_model_data(
-    request: FixtureRequest,
-    admin_client: DynamicClient,
-    unprivileged_model_namespace: Namespace,
-    models_s3_bucket_name: str,
-    model_pvc: PersistentVolumeClaim,
-    aws_secret_access_key: str,
-    aws_access_key_id: str,
-    models_s3_bucket_endpoint: str,
-    models_s3_bucket_region: str,
-) -> str:
-    return download_model_data(
-        client=admin_client,
-        aws_access_key_id=aws_access_key_id,
-        aws_secret_access_key=aws_secret_access_key,
-        model_namespace=unprivileged_model_namespace.name,
-        model_pvc_name=model_pvc.name,
-        bucket_name=models_s3_bucket_name,
-        aws_endpoint_url=models_s3_bucket_endpoint,
-        aws_default_region=models_s3_bucket_region,
-        model_path=request.param["model-dir"],
-    )
-
-
 @pytest.fixture(scope="class")
 def multi_node_serving_runtime(
     request: FixtureRequest,
@@ -86,35 +69,6 @@ def multi_node_inference_service(
     request: FixtureRequest,
     unprivileged_client: DynamicClient,
     multi_node_serving_runtime: ServingRuntime,
-    model_pvc: PersistentVolumeClaim,
-    models_bucket_downloaded_model_data: str,
-) -> Generator[InferenceService, Any, Any]:
-    with create_isvc(
-        client=unprivileged_client,
-        name=request.param["name"],
-        namespace=multi_node_serving_runtime.namespace,
-        runtime=multi_node_serving_runtime.name,
-        storage_uri=f"pvc://{model_pvc.name}/{models_bucket_downloaded_model_data}",
-        model_format=multi_node_serving_runtime.instance.spec.supportedModelFormats[0].name,
-        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
-        autoscaler_mode="external",
-        multi_node_worker_spec={},
-        wait_for_predictor_pods=False,
-    ) as isvc:
-        wait_for_inference_deployment_replicas(
-            client=unprivileged_client,
-            isvc=isvc,
-            expected_num_deployments=2,
-            runtime_name=multi_node_serving_runtime.name,
-        )
-        yield isvc
-
-
-@pytest.fixture(scope="class")
-def multi_node_oci_inference_service(
-    request: FixtureRequest,
-    unprivileged_client: DynamicClient,
-    multi_node_serving_runtime: ServingRuntime,
 ) -> Generator[InferenceService, Any, Any]:
     resources = {
         "requests": {
@@ -136,7 +90,6 @@ def multi_node_oci_inference_service(
         ]
     }
 
-    # NOTE: In KServe v0.15, the autoscaler_mode needs to be updated to "none".
     with create_isvc(
         client=unprivileged_client,
         name=request.param["name"],
@@ -145,11 +98,10 @@ def multi_node_oci_inference_service(
         storage_uri=ModelCarImage.GRANITE_8B_CODE_INSTRUCT,
         model_format=multi_node_serving_runtime.instance.spec.supportedModelFormats[0].name,
         deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
-        autoscaler_mode="external",
+        autoscaler_mode="none",
         resources=resources,
         multi_node_worker_spec=worker_resources,
         wait_for_predictor_pods=False,
-        external_route=True,
         timeout=Timeout.TIMEOUT_30MIN,
     ) as isvc:
         wait_for_inference_deployment_replicas(
@@ -177,6 +129,12 @@ def multi_node_predictor_pods_scope_class(
 def patched_multi_node_isvc_external_route(
     multi_node_inference_service: InferenceService,
 ) -> Generator[InferenceService, Any, Any]:
+    multi_node_inference_service.wait_for_condition(
+        condition=multi_node_inference_service.Condition.READY,
+        status=multi_node_inference_service.Condition.Status.TRUE,
+        timeout=Timeout.TIMEOUT_10MIN,
+    )
+
     with ResourceEditor(
         patches={
             multi_node_inference_service: {
@@ -185,7 +143,7 @@ def patched_multi_node_isvc_external_route(
         }
     ):
         for sample in TimeoutSampler(
-            wait_timeout=Timeout.TIMEOUT_1MIN,
+            wait_timeout=Timeout.TIMEOUT_5MIN,
             sleep=1,
             func=lambda: multi_node_inference_service.instance.status,
         ):
@@ -258,3 +216,8 @@ def deleted_multi_node_pod(
         isvc=multi_node_inference_service,
         timeout=Timeout.TIMEOUT_10MIN,
     )
+
+    wait_for_vllm_health(
+        client=unprivileged_client,
+        isvc=multi_node_inference_service,
+    )
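
The conftest change above is the heart of the commit: multi_node_inference_service now serves the model from an OCI model-car image (ModelCarImage.GRANITE_8B_CODE_INSTRUCT) instead of an S3 download staged on a PVC, which is why the models_bucket_downloaded_model_data fixture and the separate multi_node_oci_inference_service fixture are gone. A minimal sketch of the resulting call pattern, using only arguments that appear in this diff; the helper name is hypothetical, and the constant is assumed to resolve to an oci:// image reference:

# Sketch only: deploy_from_model_car is a hypothetical wrapper around the
# create_isvc helper shown in the diff; resources/multi_node_worker_spec are
# omitted for brevity.
from utilities.constants import KServeDeploymentType, ModelCarImage, Timeout
from utilities.inference_utils import create_isvc


def deploy_from_model_car(client, name, runtime):
    # storage_uri points at an OCI image (assumed "oci://..." model-car reference),
    # so no PVC or S3 download step is needed before the predictor starts.
    return create_isvc(
        client=client,
        name=name,
        namespace=runtime.namespace,
        runtime=runtime.name,
        storage_uri=ModelCarImage.GRANITE_8B_CODE_INSTRUCT,
        model_format=runtime.instance.spec.supportedModelFormats[0].name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        autoscaler_mode="none",  # per the removed NOTE: KServe v0.15 expects "none"
        wait_for_predictor_pods=False,
        timeout=Timeout.TIMEOUT_30MIN,
    )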

tests/model_serving/model_server/kserve/multi_node/test_nvidia_multi_node.py

Lines changed: 12 additions & 13 deletions
@@ -14,12 +14,12 @@
     verify_ray_status,
 )
 from tests.model_serving.model_server.utils import verify_inference_response
-from utilities.constants import Labels, Protocols, StorageClassName
+from utilities.constants import Labels, Protocols
 from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG
 
 pytestmark = [
     pytest.mark.rawdeployment,
-    pytest.mark.usefixtures("skip_if_no_gpu_nodes", "skip_if_no_nfs_storage_class"),
+    pytest.mark.usefixtures("skip_if_no_gpu_nodes"),
     pytest.mark.model_server_gpu,
     pytest.mark.multinode,
     pytest.mark.gpu,
@@ -31,16 +31,10 @@
 
 
 @pytest.mark.parametrize(
-    "unprivileged_model_namespace, models_bucket_downloaded_model_data, model_pvc, multi_node_inference_service",
+    "unprivileged_model_namespace, multi_node_inference_service",
     [
         pytest.param(
             {"name": "gpu-multi-node"},
-            {"model-dir": "granite-8b-code-base"},
-            {
-                "access-modes": "ReadWriteMany",
-                "storage-class-name": StorageClassName.NFS,
-                "pvc-size": "40Gi",
-            },
             {"name": "multi-vllm"},
         )
     ],
@@ -52,10 +46,10 @@ def test_multi_node_ray_status(self, multi_node_predictor_pods_scope_class):
         verify_ray_status(pods=multi_node_predictor_pods_scope_class)
 
     def test_multi_node_nvidia_gpu_status(self, multi_node_predictor_pods_scope_class):
-        """Test multi node ray status"""
+        """Test multi node nvidia gpu status"""
         verify_nvidia_gpu_status(pod=multi_node_predictor_pods_scope_class[0])
 
-    def test_multi_node_default_config(self, multi_node_serving_runtime, multi_node_predictor_pods_scope_class):
+    def test_multi_node_default_config(self, multi_node_serving_runtime, multi_node_inference_service):
         """Test multi node inference service with default config"""
         runtime_worker_spec = multi_node_serving_runtime.instance.spec.workerSpec
 
@@ -179,17 +173,22 @@ def test_multi_node_basic_external_inference(self, patched_multi_node_isvc_exter
                 "spec": {
                     "workerSpec": {
                         "pipelineParallelSize": 2,
-                        "tensorParallelSize": 4,
+                        "tensorParallelSize": 2,
                     }
                 }
             })
         ],
         indirect=True,
     )
-    def test_multi_node_tensor_parallel_size_propagation(self, unprivileged_client, patched_multi_node_spec):
+    def test_multi_node_tensor_parallel_size_propagation(
+        self, unprivileged_client, patched_multi_node_spec, max_gpu_per_node
+    ):
         """Test multi node tensor parallel size (number of GPUs per pod) propagation to pod config"""
         isvc_parallel_size = str(patched_multi_node_spec.instance.spec.predictor.workerSpec.tensorParallelSize)
 
+        if int(isvc_parallel_size) > max_gpu_per_node:
+            pytest.skip(f"tensorParallelSize {isvc_parallel_size} exceeds max GPUs per node ({max_gpu_per_node})")
+
         failed_pods: list[dict[str, Any]] = []
 
         for pod in get_pods_by_isvc_generation(client=unprivileged_client, isvc=patched_multi_node_spec):
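
The new skip guard and the tensorParallelSize change from 4 to 2 follow from how the worker spec maps to GPUs: per the test's own docstring, tensorParallelSize is the number of GPUs each pod requests, so it cannot exceed what a single node can allocate, while pipelineParallelSize sets how many ranks (pods) are spread across nodes. A standalone sketch of that arithmetic, with made-up cluster numbers:

# Sketch of the sizing rule behind the new max_gpu_per_node guard (hypothetical values).


def gpus_required(tensor_parallel_size: int, pipeline_parallel_size: int) -> int:
    # Each pipeline rank runs in its own pod and uses tensor_parallel_size GPUs.
    return tensor_parallel_size * pipeline_parallel_size


def fits_on_cluster(tensor_parallel_size: int, pipeline_parallel_size: int, max_gpu_per_node: int, gpu_nodes: int) -> bool:
    # tensorParallelSize must fit on one node; the product must fit on the cluster.
    return (
        tensor_parallel_size <= max_gpu_per_node
        and gpus_required(tensor_parallel_size, pipeline_parallel_size) <= max_gpu_per_node * gpu_nodes
    )


# Example: two worker nodes with two allocatable GPUs each.
assert fits_on_cluster(tensor_parallel_size=2, pipeline_parallel_size=2, max_gpu_per_node=2, gpu_nodes=2)
assert not fits_on_cluster(tensor_parallel_size=4, pipeline_parallel_size=2, max_gpu_per_node=2, gpu_nodes=2)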

tests/model_serving/model_server/kserve/multi_node/test_oci_multi_node.py

Lines changed: 0 additions & 38 deletions
This file was deleted.

tests/model_serving/model_server/kserve/multi_node/utils.py

Lines changed: 47 additions & 0 deletions
@@ -124,6 +124,53 @@ def get_pods_by_isvc_generation(client: DynamicClient, isvc: InferenceService) -
     raise ResourceNotFoundError(f"InferenceService {isvc.name} generation {isvc_generation} has no pods")
 
 
+@retry(wait_timeout=Timeout.TIMEOUT_10MIN, sleep=10)
+def wait_for_vllm_health(client: DynamicClient, isvc: InferenceService) -> bool:
+    """Wait for vLLM to serve inference successfully on the head pod.
+
+    After pod deletion and recovery, the vLLM /health endpoint and Ray
+    node count may report healthy before the distributed inference
+    pipeline is fully rebuilt. This function verifies the model can
+    serve an actual inference request via pod exec on the head pod.
+
+    Args:
+        client: Dynamic client.
+        isvc: InferenceService object.
+
+    Returns:
+        True when inference succeeds.
+
+    Raises:
+        RuntimeError: If inference check fails.
+    """
+    for pod in get_pods_by_isvc_label(client=client, isvc=isvc):
+        if WORKER_POD_ROLE not in pod.name:
+            result = pod.execute(
+                command=[
+                    "curl",
+                    "-s",
+                    "-o",
+                    "/dev/null",
+                    "-w",
+                    "%{http_code}",
+                    "-X",
+                    "POST",
+                    "http://localhost:8080/v1/completions",
+                    "-H",
+                    "Content-Type: application/json",
+                    "-d",
+                    f'{{"model":"{isvc.name}","prompt":"test","max_tokens":1}}',
+                ]
+            )
+            if result.strip().strip("'") != "200":
+                raise RuntimeError(f"vLLM inference check returned {result} on head pod {pod.name}")
+
+            LOGGER.info(f"vLLM inference check passed on head pod {pod.name}")
+            return True
+
+    raise RuntimeError(f"No head pod found for InferenceService {isvc.name}")
+
+
 def is_arg_in_model_spec(client: DynamicClient, isvc: InferenceService, arg: str) -> bool:
     """
     Check if arg is in model spec; spec.model.args are only added to head pod
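
The curl that wait_for_vllm_health runs inside the head pod is a minimal OpenAI-style /v1/completions call that only passes when it returns HTTP 200. For reference, the same probe expressed with requests from outside the cluster might look like the sketch below; it assumes the head pod's port 8080 has been port-forwarded to localhost and that the served model name equals the InferenceService name, mirroring the payload in the diff:

# Sketch: out-of-pod equivalent of the in-pod curl readiness probe.
# Assumes something like `oc port-forward <head-pod> 8080:8080` is already running.
import requests


def vllm_can_serve(model_name: str, base_url: str = "http://localhost:8080") -> bool:
    response = requests.post(
        f"{base_url}/v1/completions",
        json={"model": model_name, "prompt": "test", "max_tokens": 1},
        timeout=30,
    )
    # Same pass criterion as the diff: only HTTP 200 from /v1/completions counts as healthy.
    return response.status_code == 200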

utilities/manifests/vllm.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 VLLM_INFERENCE_CONFIG = {
     "default_query_model": {
         "query_input": '"prompt": "At what temperature does Nitrogen boil?", "max_tokens": 100, "temperature": 0',
-        "query_output": r'{"id":"cmpl-[a-z0-9]+","object":"text_completion","created":\d+,"model":"$model_name","choices":\[{"index":0,"text":".*Theboilingpointofnitrogenis77.4.*","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}\],"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null}}',
+        "query_output": r'{"id":"cmpl-[a-z0-9]+","object":"text_completion","created":\d+,"model":"$model_name","choices":\[{"index":0,"text":".*Theboilingpointofnitrogenis77.4.*","logprobs":null,"finish_reason":"length","stop_reason":null(,"[a-z_]+":null)*}\](,"[a-z_]+":null)*,"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null}(,"[a-z_]+":null)*}',
         "use_regex": True
     },
     "completions": {

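The query_output change replaces the hard-coded "prompt_logprobs":null field with repeatable (,"[a-z_]+":null)* groups inside the choice object, after the choices array, and after usage, so responses from newer vLLM builds that append extra null-valued fields still match. A quick sanity check of the loosened pattern, with a fabricated, whitespace-free response string (the pattern itself contains no whitespace) and $model_name substituted by hand:

# Sketch: verify the loosened regex accepts extra trailing "<field>":null entries.
import re

from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG

pattern = VLLM_INFERENCE_CONFIG["default_query_model"]["query_output"].replace("$model_name", "granite")

fabricated_response = (
    '{"id":"cmpl-abc123","object":"text_completion","created":1700000000,"model":"granite",'
    '"choices":[{"index":0,"text":"Theboilingpointofnitrogenis77.4K","logprobs":null,'
    '"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],'
    '"usage":{"prompt_tokens":10,"total_tokens":110,"completion_tokens":100,"prompt_tokens_details":null},'
    '"kv_transfer_params":null}'
)

# "prompt_logprobs" and the trailing "kv_transfer_params" are absorbed by the new null-field groups.
assert re.match(pattern, fabricated_response)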