Skip to content

Commit 896ba74

Browse files
authored
vllm spyre runtime model validation automation (#700)
* feat(model-validation): vllm spyre runtime model validation automation

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(model-validation): add env variables, fix template issues
* feat(model-validation): add env variables, fix template issues
* feat(model-validation): setup testing ports, fix serving runtime

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(model-validation): inference port
* feat(model-validation): add progress deadline for serverless deployment
* feat(model-validation): get model name before sending request

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(model-validation): skip serverless tests for spyre
* feat(model-validation): skip serverless tests for spyre
* feat(model-validation): skip serverless tests for spyre
* feat(model-validation): address PR comments

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(spyre-model-validation): use template on RHOAI

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
* feat(spyre-model-validation): use template on RHOAI

  Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>

---------

Signed-off-by: Edward Arthur Quarm Jnr <equarmjn@redhat.com>
1 parent a595065 commit 896ba74

File tree

9 files changed

+62
-13
lines changed

9 files changed

+62
-13
lines changed

tests/conftest.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import base64
2+
import binascii
23
import os
34
import shutil
45
from ast import literal_eval
@@ -157,7 +158,11 @@ def registry_pull_secret(pytestconfig: Config) -> str:
157158
"Registry pull secret is not set. "
158159
"Either pass with `--registry_pull_secret` or set `OCI_REGISTRY_PULL_SECRET` environment variable"
159160
)
160-
return registry_pull_secret
161+
try:
162+
base64.b64decode(s=registry_pull_secret, validate=True)
163+
return registry_pull_secret
164+
except binascii.Error:
165+
raise ValueError("Registry pull secret is not a valid base64 encoded string")
161166

162167

163168
@pytest.fixture(scope="session")

tests/model_serving/model_runtime/model_validation/conftest.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from utilities.constants import KServeDeploymentType, Labels, RuntimeTemplates
3232
from utilities.inference_utils import create_isvc
3333
from utilities.serving_runtime import ServingRuntimeFromTemplate
34+
3435
from simple_logger.logger import get_logger
3536

3637
LOGGER = get_logger(name=__name__)
@@ -45,6 +46,7 @@ def model_car_serving_runtime(
4546
vllm_runtime_image: str,
4647
) -> Generator[ServingRuntime, None, None]:
4748
accelerator_type = supported_accelerator_type.lower()
49+
4850
template_name = TEMPLATE_MAP.get(accelerator_type, RuntimeTemplates.VLLM_CUDA)
4951
LOGGER.info(f"using template: {template_name}")
5052
assert model_namespace.name is not None
@@ -88,6 +90,20 @@ def vllm_model_car_inference_service(
8890
resources["limits"][identifier] = gpu_count
8991
isvc_kwargs["resources"] = resources
9092

93+
if (
94+
identifier == Labels.Spyre.SPYRE_COM_GPU
95+
and deployment_config.get("deployment_type") == KServeDeploymentType.SERVERLESS
96+
):
97+
pytest.skip("Spyre cluster is not setup with TLS/mTLS")
98+
if identifier == Labels.Spyre.SPYRE_COM_GPU:
99+
isvc_kwargs["scheduler_name"] = "spyre-scheduler"
100+
resources["requests"] = {
101+
"ibm.com/spyre_pf": gpu_count,
102+
}
103+
resources["limits"] = {
104+
"ibm.com/spyre_pf": gpu_count,
105+
}
106+
91107
if timeout:
92108
isvc_kwargs["timeout"] = timeout
93109

@@ -131,6 +147,7 @@ def kserve_registry_pull_secret(
131147
"ACCESS_TYPE": PULL_SECRET_ACCESS_TYPE,
132148
"OCI_HOST": registry_host,
133149
},
150+
type="kubernetes.io/dockerconfigjson",
134151
wait_for_resource=True,
135152
) as secret:
136153
yield secret
@@ -257,7 +274,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
257274
if not isinstance(model_car_data, list):
258275
raise ValueError("Invalid format for `model-car` in YAML. Expected a list of objects.")
259276

260-
# Check if metafunc.cls is not None to avoid linter errors
261277
if not metafunc.cls:
262278
return
263279

tests/model_serving/model_runtime/model_validation/constant.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
AcceleratorType.NVIDIA: Labels.Nvidia.NVIDIA_COM_GPU,
88
AcceleratorType.AMD: "amd.com/gpu",
99
AcceleratorType.GAUDI: "habana.ai/gaudi",
10+
AcceleratorType.SPYRE: Labels.Spyre.SPYRE_COM_GPU,
1011
}
1112

1213
TEMPLATE_MAP: dict[str, str] = {
1314
AcceleratorType.NVIDIA: RuntimeTemplates.VLLM_CUDA,
1415
AcceleratorType.AMD: RuntimeTemplates.VLLM_ROCM,
15-
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDUI,
16+
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDI,
17+
AcceleratorType.SPYRE: RuntimeTemplates.VLLM_SPYRE,
1618
}
1719

1820

@@ -71,10 +73,10 @@
7173
],
7274
]
7375

74-
PULL_SECRET_ACCESS_TYPE: str = "WyJQdWxsIl0=" # Base64 encoded value for "Pull"
76+
PULL_SECRET_ACCESS_TYPE: str = '["Pull"]'
7577
PULL_SECRET_NAME: str = "oci-registry-pull-secret"
76-
INFERENCE_SERVICE_PORT: int = 8080
77-
CONTAINER_PORT: int = 8080
78+
SPYRE_INFERENCE_SERVICE_PORT: int = 8000
79+
SPYRE_CONTAINER_PORT: int = 8000
7880
TIMEOUT_20MIN: int = 30 * 60
7981
OPENAI_ENDPOINT_NAME: str = "openai"
8082
TGIS_ENDPOINT_NAME: str = "tgis"

tests/model_serving/model_runtime/model_validation/sample_modelcar_config.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
11
model-car:
2+
- name: granite-3.1-8b-instruct
3+
image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-instruct:1.5
4+
model_output_type: text
5+
serving_arguments:
6+
args:
7+
- "--uvicorn-log-level=info"
8+
- "--max-model-len=2048"
9+
- "--trust-remote-code"
10+
- "--distributed-executor-backend=mp"
11+
gpu_count: 1
12+
213
- name: granite-3.1-8b-base-quantized.w4a16
314
image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-base-quantized-w4a16:1.5
415
model_output_type: text

tests/model_serving/model_runtime/model_validation/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import re
22
from typing import Any
3-
43
from tests.model_serving.model_runtime.vllm.constant import VLLM_SUPPORTED_QUANTIZATION
54

65

tests/model_serving/model_runtime/utils.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
OPENAI_ENDPOINT_NAME,
1111
AUDIO_FILE_URL,
1212
AUDIO_FILE_LOCAL_PATH,
13+
SPYRE_INFERENCE_SERVICE_PORT,
1314
)
1415
from utilities.constants import Ports
1516
from utilities.exceptions import NotSupportedError
@@ -145,13 +146,14 @@ def validate_raw_openai_inference_request(
145146
completion_query: list[dict[str, str]],
146147
model_output_type: str,
147148
model_name: str,
149+
port: int = Ports.REST_PORT,
148150
) -> None:
149151
if model_output_type == "audio":
150152
LOGGER.info("Running audio inference test")
151153
model_info, completion_responses = run_audio_inference(
152154
pod_name=pod_name,
153155
isvc=isvc,
154-
port=Ports.REST_PORT,
156+
port=port,
155157
endpoint=OPENAI_ENDPOINT_NAME,
156158
model_name=model_name,
157159
)
@@ -161,10 +163,13 @@ def validate_raw_openai_inference_request(
161163
return
162164
elif model_output_type == "text":
163165
LOGGER.info("Running text inference test")
166+
scheduler_name = getattr(isvc.instance.spec.predictor, "schedulerName", "") or ""
167+
if scheduler_name.lower() == "spyre-scheduler":
168+
port = SPYRE_INFERENCE_SERVICE_PORT
164169
model_info, completion_responses = run_raw_inference(
165170
pod_name=pod_name,
166171
isvc=isvc,
167-
port=Ports.REST_PORT,
172+
port=port,
168173
endpoint=OPENAI_ENDPOINT_NAME,
169174
completion_query=completion_query,
170175
)
@@ -205,6 +210,8 @@ def fetch_openai_response(
205210
model_name: str,
206211
completion_query: list[dict[str, str]] | None = None,
207212
) -> tuple[Any, list[Any]]:
213+
model_info = OpenAIClient.get_request_http(host=url, endpoint=OpenAIEnpoints.MODELS_INFO)
214+
model_name = model_info[0]["id"] if model_info else model_name
208215
if completion_query is None:
209216
completion_query = COMPLETION_QUERY
210217
completion_responses = []
@@ -216,7 +223,6 @@ def fetch_openai_response(
216223
)
217224
completion_responses.append(completion_response)
218225

219-
model_info = OpenAIClient.get_request_http(host=url, endpoint=OpenAIEnpoints.MODELS_INFO)
220226
return model_info, completion_responses
221227

222228

tests/model_serving/model_runtime/vllm/constant.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
TEMPLATE_MAP: dict[str, str] = {
1616
AcceleratorType.NVIDIA: RuntimeTemplates.VLLM_CUDA,
1717
AcceleratorType.AMD: RuntimeTemplates.VLLM_ROCM,
18-
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDUI,
18+
AcceleratorType.GAUDI: RuntimeTemplates.VLLM_GAUDI,
1919
}
2020

2121
PREDICT_RESOURCES: dict[str, Union[list[dict[str, Union[str, dict[str, str]]]], dict[str, dict[str, str]]]] = {

utilities/constants.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ class RuntimeTemplates:
7474
TGIS_GRPC_SERVING: str = "tgis-grpc-serving-template"
7575
VLLM_CUDA: str = "vllm-cuda-runtime-template"
7676
VLLM_ROCM: str = "vllm-rocm-runtime-template"
77-
VLLM_GAUDUI: str = "vllm-gaudi-runtime-template"
77+
VLLM_GAUDI: str = "vllm-gaudi-runtime-template"
78+
VLLM_SPYRE: str = "vllm-spyre-x86-runtime-template"
7879
MLSERVER_GRPC: str = "mlserver-grpc-runtime-template"
7980
MLSERVER_REST: str = "mlserver-rest-runtime-template"
8081
TRITON_REST: str = "triton-rest-runtime-template"
@@ -123,7 +124,8 @@ class AcceleratorType:
123124
NVIDIA: str = "nvidia"
124125
AMD: str = "amd"
125126
GAUDI: str = "gaudi"
126-
SUPPORTED_LISTS: list[str] = [NVIDIA, AMD, GAUDI]
127+
SPYRE: str = "spyre"
128+
SUPPORTED_LISTS: list[str] = [NVIDIA, AMD, GAUDI, SPYRE]
127129

128130

129131
class ApiGroups:
@@ -212,6 +214,9 @@ class Nvidia:
212214
class ROCm:
213215
ROCM_GPU: str = "amd.com/gpu"
214216

217+
class Spyre:
218+
SPYRE_COM_GPU: str = "ibm.com/spyre_pf"
219+
215220
class Kueue:
216221
MANAGED: str = "kueue.openshift.io/managed"
217222
QUEUE_NAME: str = "kueue.x-k8s.io/queue-name"

utilities/inference_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ def create_isvc(
583583
protocol_version: str | None = None,
584584
labels: dict[str, str] | None = None,
585585
auto_scaling: dict[str, Any] | None = None,
586+
scheduler_name: str | None = None,
586587
) -> Generator[InferenceService, Any, Any]:
587588
"""
588589
Create InferenceService object.
@@ -618,6 +619,7 @@ def create_isvc(
618619
teardown (bool): Teardown
619620
protocol_version (str): Protocol version of the model server
620621
auto_scaling (dict[str, Any]): Auto scaling configuration for the model
622+
scheduler_name (str): Scheduler name
621623
622624
Yields:
623625
InferenceService: InferenceService object
@@ -712,6 +714,9 @@ def create_isvc(
712714
if protocol_version is not None:
713715
predictor_dict["model"]["protocolVersion"] = protocol_version
714716

717+
if scheduler_name is not None:
718+
predictor_dict["schedulerName"] = scheduler_name
719+
715720
with InferenceService(
716721
client=client,
717722
name=name,

0 commit comments

Comments (0)