Skip to content

Commit 717d71d

Browse files
feat: add keda scaling tests
Signed-off-by: Vedant Mahabaleshwarkar <vmahabal@redhat.com>
1 parent baa3472 commit 717d71d

11 files changed

Lines changed: 574 additions & 30 deletions

File tree

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ markers =
3030
model_server_gpu: Mark tests which are testing model server with GPU resources
3131
gpu: Mark tests which require GPU resources
3232
multinode: Mark tests which require multiple nodes
33+
keda: Mark tests which are testing KEDA scaling
3334

3435
addopts =
3536
-s
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
from typing import Any, Generator
2+
3+
import pytest
4+
from _pytest.fixtures import FixtureRequest
5+
from kubernetes.dynamic import DynamicClient
6+
from ocp_resources.inference_service import InferenceService
7+
from ocp_resources.namespace import Namespace
8+
from ocp_resources.secret import Secret
9+
from ocp_resources.service_account import ServiceAccount
10+
from ocp_resources.serving_runtime import ServingRuntime
11+
from simple_logger.logger import get_logger
12+
from tests.model_serving.model_runtime.vllm.utils import validate_supported_quantization_schema
13+
from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES, TEMPLATE_MAP
14+
15+
from utilities.constants import (
16+
KServeDeploymentType,
17+
RuntimeTemplates,
18+
Labels,
19+
)
20+
from tests.model_serving.model_server.utils import (
21+
run_vllm_concurrent_load,
22+
run_ovms_concurrent_load,
23+
)
24+
from utilities.constants import (
25+
ModelAndFormat,
26+
)
27+
from utilities.inference_utils import create_isvc
28+
from utilities.serving_runtime import ServingRuntimeFromTemplate
29+
from utilities.constants import THANOS_QUERIER_ADDRESS
30+
from syrupy.extensions.json import JSONSnapshotExtension
31+
32+
LOGGER = get_logger(name=__name__)
33+
34+
35+
def create_keda_auto_scaling_config(
    query: str,
    target_value: str,
    model_name: str | None = None,
    namespace: str | None = None,
) -> dict[str, Any]:
    """Create a KEDA auto-scaling configuration for inference services.

    Args:
        query: The Prometheus query KEDA evaluates to drive scaling.
        target_value: Target value for the metric; KEDA scales out when the
            query result exceeds this value.
        model_name: Name of the model. Accepted for caller compatibility;
            not currently embedded in the generated configuration.
        namespace: Kubernetes namespace. Accepted for caller compatibility;
            not currently embedded in the generated configuration.

    Returns:
        dict: Auto-scaling configuration with a single External metric backed
        by Prometheus (via the Thanos querier) using bearer-token auth through
        the ``inference-prometheus-auth`` authentication reference.
    """
    return {
        "metrics": [
            {
                "type": "External",
                "external": {
                    "metric": {
                        "backend": "prometheus",
                        "serverAddress": THANOS_QUERIER_ADDRESS,
                        "query": query,
                    },
                    "target": {"type": "Value", "value": target_value},
                    "authenticationRef": {
                        "authModes": "bearer",
                        "authenticationRef": {
                            "name": "inference-prometheus-auth",
                        },
                    },
                },
            }
        ]
    }
71+
72+
73+
@pytest.fixture(scope="class")
def vllm_serving_runtime(
    request: FixtureRequest,
    admin_client: DynamicClient,
    model_namespace: Namespace,
    supported_accelerator_type: str,
    vllm_runtime_image: str,
) -> Generator[ServingRuntime, None, None]:
    """Class-scoped vLLM ServingRuntime built from the accelerator-specific template.

    Falls back to the CUDA template when the accelerator type has no dedicated
    entry in TEMPLATE_MAP. The runtime is torn down when the class finishes.
    """
    runtime_template = TEMPLATE_MAP.get(supported_accelerator_type.lower(), RuntimeTemplates.VLLM_CUDA)
    runtime_kwargs = {
        "client": admin_client,
        "name": "vllm-runtime",
        "namespace": model_namespace.name,
        "template_name": runtime_template,
        "deployment_type": request.param["deployment_type"],
        "runtime_image": vllm_runtime_image,
        "support_tgis_open_ai_endpoints": True,
    }
    with ServingRuntimeFromTemplate(**runtime_kwargs) as serving_runtime:
        yield serving_runtime
93+
94+
95+
@pytest.fixture(scope="class")
def stressed_keda_vllm_inference_service(
    request: FixtureRequest,
    admin_client: DynamicClient,
    model_namespace: Namespace,
    vllm_serving_runtime: ServingRuntime,
    supported_accelerator_type: str,
    s3_models_storage_uri: str,
    model_service_account: ServiceAccount,
    response_snapshot: Any,
) -> Generator[InferenceService, Any, Any]:
    """Deploy a KEDA-autoscaled vLLM InferenceService and put it under load.

    Builds the InferenceService from the test parametrization (GPU count,
    runtime arguments, quantization, replica bounds, scaling metric), waits
    until it is Ready, then runs concurrent inference load so the KEDA
    trigger has traffic to scale on before yielding to the tests.
    """
    # Local import keeps this fix self-contained; deepcopy prevents the
    # class-scoped fixture from mutating the shared PREDICT_RESOURCES constant.
    from copy import deepcopy

    isvc_kwargs: dict[str, Any] = {
        "client": admin_client,
        "name": request.param["name"],
        "namespace": model_namespace.name,
        "runtime": vllm_serving_runtime.name,
        "storage_uri": s3_models_storage_uri,
        "model_format": vllm_serving_runtime.instance.spec.supportedModelFormats[0].name,
        "model_service_account": model_service_account.name,
        "deployment_mode": request.param.get("deployment_mode", KServeDeploymentType.RAW_DEPLOYMENT),
        "autoscaler_mode": "keda",
        "external_route": True,
    }
    accelerator_type = supported_accelerator_type.lower()
    gpu_count = request.param.get("gpu_count")
    timeout = request.param.get("timeout")
    identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU)
    resources: Any = deepcopy(PREDICT_RESOURCES["resources"])
    resources["requests"][identifier] = gpu_count
    resources["limits"][identifier] = gpu_count
    isvc_kwargs["resources"] = resources
    if timeout:
        isvc_kwargs["timeout"] = timeout
    # gpu_count may be absent from the parametrization; guard the None > 1 TypeError
    if gpu_count and gpu_count > 1:
        isvc_kwargs["volumes"] = PREDICT_RESOURCES["volumes"]
        isvc_kwargs["volumes_mounts"] = PREDICT_RESOURCES["volume_mounts"]
    if arguments := request.param.get("runtime_argument"):
        # Drop caller-supplied values this fixture derives itself.
        arguments = [
            arg
            for arg in arguments
            if not (arg.startswith("--tensor-parallel-size") or arg.startswith("--quantization"))
        ]
        arguments.append(f"--tensor-parallel-size={gpu_count}")
        if quantization := request.param.get("quantization"):
            validate_supported_quantization_schema(q_type=quantization)
            arguments.append(f"--quantization={quantization}")
        isvc_kwargs["argument"] = arguments

    isvc_kwargs["min_replicas"] = request.param.get("initial_pod_count")
    isvc_kwargs["max_replicas"] = request.param.get("final_pod_count")

    # create_keda_auto_scaling_config requires only the query and target value.
    isvc_kwargs["auto_scaling"] = create_keda_auto_scaling_config(
        query=request.param.get("metrics_query"),
        target_value=str(request.param.get("metrics_threshold")),
    )

    with create_isvc(**isvc_kwargs) as isvc:
        isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
        # Generate traffic so the scaling metric is populated before tests run.
        run_vllm_concurrent_load(isvc=isvc, response_snapshot=response_snapshot)
        yield isvc
156+
157+
158+
@pytest.fixture(scope="class")
def stressed_ovms_keda_inference_service(
    request: FixtureRequest,
    unprivileged_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    ovms_kserve_serving_runtime: ServingRuntime,
    models_endpoint_s3_secret: Secret,
) -> Generator[InferenceService, Any, Any]:
    """Deploy a KEDA-autoscaled OVMS InferenceService and put it under load.

    Creates a raw-deployment InferenceService from the test parametrization,
    waits until it is Ready, then runs concurrent inference load so the KEDA
    scaling metric has traffic before yielding to the tests.
    """
    model_name = f"{request.param['name']}-raw"
    with create_isvc(
        client=unprivileged_client,
        name=model_name,
        namespace=unprivileged_model_namespace.name,
        external_route=True,
        runtime=ovms_kserve_serving_runtime.name,
        storage_path=request.param["model-dir"],
        storage_key=models_endpoint_s3_secret.name,
        model_format=ModelAndFormat.OPENVINO_IR,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        model_version=request.param["model-version"],
        min_replicas=request.param["initial_pod_count"],
        max_replicas=request.param["final_pod_count"],
        autoscaler_mode="keda",
        # create_keda_auto_scaling_config accepts only query/target_value;
        # the extra model_name/namespace kwargs previously raised TypeError.
        auto_scaling=create_keda_auto_scaling_config(
            query=request.param["metrics_query"],
            target_value=str(request.param["metrics_threshold"]),
        ),
    ) as isvc:
        isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
        # Generate traffic so the scaling metric is populated before tests run.
        run_ovms_concurrent_load(isvc=isvc)
        yield isvc
191+
192+
193+
@pytest.fixture(scope="session")
def skip_if_no_supported_gpu_type(supported_accelerator_type: str) -> None:
    """Skip the requesting test when no supported accelerator type is configured."""
    if not supported_accelerator_type:
        # Fixed typos in the user-facing skip message ("Accelartor ... is not provide").
        pytest.skip("Accelerator type is not provided, vLLM test can not be run on CPU")
197+
198+
199+
@pytest.fixture
def response_snapshot(snapshot: Any) -> Any:
    """Syrupy snapshot fixture configured to serialize captured values as JSON."""
    json_snapshot = snapshot.use_extension(extension_class=JSONSnapshotExtension)
    return json_snapshot
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import pytest
2+
from simple_logger.logger import get_logger
3+
from typing import Any, Generator
4+
from kubernetes.dynamic import DynamicClient
5+
from ocp_resources.namespace import Namespace
6+
from ocp_resources.inference_service import InferenceService
7+
from tests.model_serving.model_server.utils import verify_keda_scaledobject, verify_final_pod_count
8+
from tests.model_serving.model_runtime.vllm.constant import BASE_RAW_DEPLOYMENT_CONFIG
9+
from tests.model_serving.model_runtime.vllm.basic_model_deployment.test_granite_7b_starter import SERVING_ARGUMENT
10+
from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs
11+
from utilities.monitoring import validate_metrics_field
12+
13+
LOGGER = get_logger(name=__name__)


# NOTE(review): this mutates the imported BASE_RAW_DEPLOYMENT_CONFIG dict at
# import time, which is visible to every other module sharing that constant —
# confirm this is intentional.
BASE_RAW_DEPLOYMENT_CONFIG["runtime_argument"] = SERVING_ARGUMENT

# Replica bounds the KEDA scaler is expected to move between under load.
INITIAL_POD_COUNT = 1
FINAL_POD_COUNT = 5

OVMS_MODEL_NAMESPACE = "ovms-keda"
OVMS_MODEL_NAME = "onnx-raw"
# Average OVMS inference time over a 5m window (sum of inference time divided
# by inference count, grouped by model name) — used as the KEDA scaling query.
OVMS_METRICS_QUERY = (
    f"sum by (name) (rate(ovms_inference_time_us_sum{{"
    f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
    f"}}[5m])) / "
    f"sum by (name) (rate(ovms_inference_time_us_count{{"
    f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
    f"}}[5m]))"
)
# Scaling threshold for the query above (microseconds, per the _us metric names).
OVMS_METRICS_THRESHOLD = 200

# All tests in this module are KEDA tests and need valid AWS credentials.
pytestmark = [pytest.mark.keda, pytest.mark.usefixtures("valid_aws_config")]
34+
35+
36+
@pytest.mark.parametrize(
    "unprivileged_model_namespace, ovms_kserve_serving_runtime, stressed_ovms_keda_inference_service",
    [
        pytest.param(
            {"name": "ovms-keda"},
            RunTimeConfigs.ONNX_OPSET13_RUNTIME_CONFIG,
            {
                "name": ModelFormat.ONNX,
                "model-version": ModelVersion.OPSET13,
                "model-dir": "test-dir",
                "initial_pod_count": INITIAL_POD_COUNT,
                "final_pod_count": FINAL_POD_COUNT,
                "metrics_query": OVMS_METRICS_QUERY,
                "metrics_threshold": OVMS_METRICS_THRESHOLD,
            },
        )
    ],
    indirect=True,
)
class TestOVMSKedaScaling:
    """KEDA scaling tests for an OVMS-served ONNX model under concurrent load.

    The stressed_ovms_keda_inference_service fixture deploys the model with a
    KEDA autoscaler and generates inference traffic before these tests run.
    """

    def test_ovms_keda_scaling_verify_scaledobject(
        self,
        unprivileged_model_namespace: Namespace,
        unprivileged_client: DynamicClient,
        ovms_kserve_serving_runtime: Any,
        stressed_ovms_keda_inference_service: InferenceService,
    ) -> None:
        """Verify the ScaledObject for the isvc has the expected prometheus trigger."""
        verify_keda_scaledobject(
            client=unprivileged_client,
            isvc=stressed_ovms_keda_inference_service,
            expected_trigger_type="prometheus",
            expected_query=OVMS_METRICS_QUERY,
            expected_threshold=OVMS_METRICS_THRESHOLD,
        )

    def test_ovms_keda_scaling_verify_metrics(
        self,
        unprivileged_model_namespace: Namespace,
        unprivileged_client: DynamicClient,
        ovms_kserve_serving_runtime: Any,
        stressed_ovms_keda_inference_service: InferenceService,
        prometheus: Any,
    ) -> None:
        """Verify the scaling metric exceeds the threshold after the load phase."""
        validate_metrics_field(
            prometheus=prometheus,
            metrics_query=OVMS_METRICS_QUERY,
            expected_value=str(OVMS_METRICS_THRESHOLD),
            greater_than=True,
        )

    def test_ovms_keda_scaling_verify_final_pod_count(
        self,
        unprivileged_model_namespace: Namespace,
        unprivileged_client: DynamicClient,
        ovms_kserve_serving_runtime: Any,
        stressed_ovms_keda_inference_service: InferenceService,
    ) -> None:
        """Verify KEDA scaled the deployment out to the expected pod count."""
        verify_final_pod_count(
            unprivileged_client=unprivileged_client,
            isvc=stressed_ovms_keda_inference_service,
            final_pod_count=FINAL_POD_COUNT,
        )

0 commit comments

Comments
 (0)