
Commit 8fc6225

[model server] multi-node: Add tls and workerSpec tests (opendatahub-io#211)
* Create size-labeler.yml
* Delete .github/workflows/size-labeler.yml
* model mesh - add auth tests
* xx
* verify product version in mm container
* feat: add mutli node tls tests
* feat: add mutli node tls tests
* feat: add mutli node tls tests
* fix: set default
* fix: update code
* fix: add wait for exposed
* feat: add tensor and pipeline size
* feat: add tensor and pipeline size
* fix: add expose
* fix: fix pod fetching by generation
* fix: fix pod fetching by generation
* fix: address comment
* fix: add tls marker
* fix: fix parallel condition
* fix: address comment
* ci: fix delete pod
1 parent 1395245 commit 8fc6225

13 files changed

Lines changed: 366 additions & 35 deletions


pytest.ini

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ markers =
     serverless: Mark tests which are serverless tests
     rawdeployment: Mark tests which are raw deployment tests
     minio: Mark tests which are using MinIO storage
+    tls: Mark tests which are testing TLS

 addopts =
     -s
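
Registering the marker here keeps pytest from warning when the new TLS tests use it. For context, a registered marker is applied and selected roughly like this (the test name below is made up, not from this commit):

    import pytest

    @pytest.mark.tls
    def test_ray_tls_secret_mounted():
        ...

    # run only the TLS tests: pytest -m tls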

tests/model_explainability/trustyai_service/drift/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ def mlserver_runtime(
         protocol_versions=["v2"],
         annotations={
             f"{ApiGroups.OPENDATAHUB_IO}/accelerator-name": "",
-            f"{ApiGroups.OPENDATAHUB_IO}/recommended-accelerators": '["nvidia.com/gpu"]',
+            f"{ApiGroups.OPENDATAHUB_IO}/recommended-accelerators": [Labels.Nvidia.NVIDIA_COM_GPU],
             f"{ApiGroups.OPENDATAHUB_IO}/template-display-name": "KServe MLServer",
             "prometheus.kserve.io/path": "/metrics",
             "prometheus.io/port": str(Ports.REST_PORT),

tests/model_serving/model_runtime/vllm/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
     validate_supported_quantization_schema,
     skip_if_deployment_mode,
 )
-from utilities.constants import KServeDeploymentType, RuntimeTemplates
+from utilities.constants import KServeDeploymentType, Labels, RuntimeTemplates
 from pytest import FixtureRequest
 from syrupy.extensions.json import JSONSnapshotExtension
 from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES, TEMPLATE_MAP
@@ -76,7 +76,7 @@ def vllm_inference_service(
     accelerator_type = supported_accelerator_type.lower()
     gpu_count = request.param.get("gpu_count")
     timeout = request.param.get("timeout")
-    identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, "nvidia.com/gpu")
+    identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU)
     resources: Any = PREDICT_RESOURCES["resources"]
     resources["requests"][identifier] = gpu_count
     resources["limits"][identifier] = gpu_count

tests/model_serving/model_runtime/vllm/constant.py

Lines changed: 2 additions & 2 deletions
@@ -1,13 +1,13 @@
 from typing import Any, Union
-from utilities.constants import AcceleratorType, KServeDeploymentType, RuntimeTemplates
+from utilities.constants import AcceleratorType, KServeDeploymentType, Labels, RuntimeTemplates

 OPENAI_ENDPOINT_NAME: str = "openai"
 TGIS_ENDPOINT_NAME: str = "tgis"
 # Quantization
 VLLM_SUPPORTED_QUANTIZATION: list[str] = ["marlin", "awq"]
 # Configurations
 ACCELERATOR_IDENTIFIER: dict[str, str] = {
-    AcceleratorType.NVIDIA: "nvidia.com/gpu",
+    AcceleratorType.NVIDIA: Labels.Nvidia.NVIDIA_COM_GPU,
     AcceleratorType.AMD: "amd.com/gpu",
     AcceleratorType.GAUDI: "habana.ai/gaudi",
 }
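
This mapping feeds the vllm_inference_service fixture above: the resolved identifier becomes the Kubernetes resource key for the GPU request and limit. A worked illustration with made-up values (and assuming AcceleratorType.NVIDIA is the lowercase string "nvidia"):

    accelerator_type = "nvidia"   # supported_accelerator_type.lower()
    gpu_count = 2                 # request.param.get("gpu_count")
    identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU)
    # identifier == "nvidia.com/gpu"; unknown accelerator types fall back to the same NVIDIA key,
    # so the predictor resources end up with requests/limits of {"nvidia.com/gpu": 2}.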

tests/model_serving/model_server/multi_node/conftest.py

Lines changed: 122 additions & 7 deletions
@@ -8,15 +8,24 @@
 from ocp_resources.node import Node
 from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
 from ocp_resources.pod import Pod
+from ocp_resources.resource import ResourceEditor
+from ocp_resources.secret import Secret
 from ocp_resources.serving_runtime import ServingRuntime
+from pytest_testconfig import config as py_config
+from timeout_sampler import TimeoutSampler

-from utilities.constants import KServeDeploymentType
+from tests.model_serving.model_server.multi_node.utils import (
+    delete_multi_node_pod_by_role,
+)
+from utilities.constants import KServeDeploymentType, Labels, Protocols, Timeout
 from utilities.general import download_model_data
 from utilities.inference_utils import create_isvc
 from utilities.infra import (
     get_pods_by_isvc_label,
+    verify_no_failed_pods,
     wait_for_inference_deployment_replicas,
 )
+from utilities.serving_runtime import ServingRuntimeFromTemplate


 @pytest.fixture(scope="session")
@@ -61,28 +70,47 @@ def models_bucket_downloaded_model_data(


 @pytest.fixture(scope="class")
-def multi_node_inference_service(
+def multi_node_serving_runtime(
     request: FixtureRequest,
     admin_client: DynamicClient,
     model_namespace: Namespace,
-    serving_runtime_from_template: ServingRuntime,
+) -> Generator[ServingRuntime, Any, Any]:
+    with ServingRuntimeFromTemplate(
+        client=admin_client,
+        name="vllm-multinode-runtime",  # TODO: rename servingruntime when RHOAIENG-16147 is resolved
+        namespace=model_namespace.name,
+        template_name="vllm-multinode-runtime-template",
+        multi_model=False,
+        enable_http=True,
+    ) as model_runtime:
+        yield model_runtime
+
+
+@pytest.fixture(scope="class")
+def multi_node_inference_service(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    multi_node_serving_runtime: ServingRuntime,
     model_pvc: PersistentVolumeClaim,
     models_bucket_downloaded_model_data: str,
 ) -> Generator[InferenceService, Any, Any]:
     with create_isvc(
         client=admin_client,
         name=request.param["name"],
-        namespace=model_namespace.name,
-        runtime=serving_runtime_from_template.name,
+        namespace=multi_node_serving_runtime.namespace,
+        runtime=multi_node_serving_runtime.name,
         storage_uri=f"pvc://{model_pvc.name}/{models_bucket_downloaded_model_data}",
-        model_format=serving_runtime_from_template.instance.spec.supportedModelFormats[0].name,
+        model_format=multi_node_serving_runtime.instance.spec.supportedModelFormats[0].name,
         deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
         autoscaler_mode="external",
         multi_node_worker_spec={},
         wait_for_predictor_pods=False,
     ) as isvc:
         wait_for_inference_deployment_replicas(
-            client=admin_client, isvc=isvc, expected_num_deployments=2, runtime_name=serving_runtime_from_template.name
+            client=admin_client,
+            isvc=isvc,
+            expected_num_deployments=2,
+            runtime_name=multi_node_serving_runtime.name,
         )
         yield isvc

@@ -96,3 +124,90 @@ def multi_node_predictor_pods_scope_class(
         client=admin_client,
         isvc=multi_node_inference_service,
     )
+
+
+@pytest.fixture(scope="function")
+def patched_multi_node_isvc_external_route(
+    multi_node_inference_service: InferenceService,
+) -> Generator[InferenceService, Any, Any]:
+    with ResourceEditor(
+        patches={
+            multi_node_inference_service: {
+                "metadata": {"labels": {Labels.Kserve.NETWORKING_KSERVE_IO: Labels.Kserve.EXPOSED}},
+            }
+        }
+    ):
+        for sample in TimeoutSampler(
+            wait_timeout=Timeout.TIMEOUT_1MIN,
+            sleep=1,
+            func=lambda: multi_node_inference_service.instance.status,
+        ):
+            if sample and sample.get("url", "").startswith(Protocols.HTTPS):
+                break
+
+        yield multi_node_inference_service
+
+
+@pytest.fixture(scope="function")
+def patched_multi_node_worker_spec(
+    request: FixtureRequest,
+    multi_node_inference_service: InferenceService,
+) -> Generator[InferenceService, Any, Any]:
+    with ResourceEditor(
+        patches={
+            multi_node_inference_service: {
+                "spec": {
+                    "predictor": {"workerSpec": request.param["worker-spec"]},
+                },
+            }
+        }
+    ):
+        yield multi_node_inference_service
+
+
+@pytest.fixture()
+def ray_ca_tls_secret(admin_client: DynamicClient) -> Secret:
+    return Secret(
+        client=admin_client,
+        name="ray-ca-tls",
+        namespace=py_config["applications_namespace"],
+    )
+
+
+@pytest.fixture()
+def ray_tls_secret(admin_client: DynamicClient, multi_node_inference_service: InferenceService) -> Secret:
+    return Secret(
+        client=admin_client,
+        name="ray-tls",
+        namespace=multi_node_inference_service.namespace,
+    )
+
+
+@pytest.fixture()
+def deleted_serving_runtime(
+    multi_node_serving_runtime: ServingRuntime,
+) -> Generator[None, Any, None]:
+    multi_node_serving_runtime.clean_up()
+
+    yield
+
+    multi_node_serving_runtime.deploy()
+
+
+@pytest.fixture()
+def deleted_multi_node_pod(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    multi_node_inference_service: InferenceService,
+) -> None:
+    delete_multi_node_pod_by_role(
+        client=admin_client,
+        isvc=multi_node_inference_service,
+        role=request.param["pod-role"],
+    )
+
+    verify_no_failed_pods(
+        client=admin_client,
+        isvc=multi_node_inference_service,
+        timeout=Timeout.TIMEOUT_10MIN,
+    )
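
The new fixtures pull their inputs from request.param, so tests are expected to drive them through indirect parametrization. A hedged sketch of how a workerSpec test might wire this up (the test name, parameter values, and workerSpec field names are illustrative, not taken from this commit):

    import pytest

    @pytest.mark.tls
    @pytest.mark.parametrize(
        "multi_node_inference_service, patched_multi_node_worker_spec",
        [
            pytest.param(
                {"name": "multi-node-vllm"},  # read via request.param["name"]
                {"worker-spec": {"tensorParallelSize": 1, "pipelineParallelSize": 2}},  # request.param["worker-spec"]
            )
        ],
        indirect=True,
    )
    def test_worker_spec_is_applied(patched_multi_node_worker_spec):
        assert patched_multi_node_worker_spec.instance.spec.predictor.workerSpec is not None

    # deleted_multi_node_pod is parametrized the same way, e.g. with {"pod-role": "head"}.
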
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+HEAD_POD_ROLE: str = "head"
+WORKER_POD_ROLE: str = "worker"
+SUPPORTED_ROLES: set[str] = {HEAD_POD_ROLE, WORKER_POD_ROLE}
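
These role names back the delete_multi_node_pod_by_role helper imported in the conftest above. Its implementation is not part of this excerpt; a rough sketch, assuming the role is inferred from the pod name and that the helper sits in tests/model_serving/model_server/multi_node/utils.py next to these constants, might look like:

    # Hypothetical sketch only -- the real helper may identify head/worker pods differently.
    from kubernetes.dynamic import DynamicClient
    from ocp_resources.inference_service import InferenceService

    from utilities.infra import get_pods_by_isvc_label


    def delete_multi_node_pod_by_role(client: DynamicClient, isvc: InferenceService, role: str) -> None:
        assert role in SUPPORTED_ROLES, f"role must be one of {SUPPORTED_ROLES}"

        for pod in get_pods_by_isvc_label(client=client, isvc=isvc):
            # Assumption: worker pods carry "worker" in their name; the head pod does not.
            if (role == WORKER_POD_ROLE) == ("worker" in pod.name):
                pod.delete(wait=True)
                return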
