Skip to content

Commit 5ae3512

Browse files
feat: add keda scaling tests
Signed-off-by: Vedant Mahabaleshwarkar <vmahabal@redhat.com>
1 parent baa3472 commit 5ae3512

9 files changed

Lines changed: 532 additions & 4 deletions

File tree

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ markers =
3030
model_server_gpu: Mark tests which are testing model server with GPU resources
3131
gpu: Mark tests which require GPU resources
3232
multinode: Mark tests which require multiple nodes
33+
keda: Mark tests which are testing KEDA scaling
3334

3435
addopts =
3536
-s
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
from typing import Any, Generator
2+
3+
import pytest
4+
from _pytest.fixtures import FixtureRequest
5+
from kubernetes.dynamic import DynamicClient
6+
from ocp_resources.inference_service import InferenceService
7+
from ocp_resources.namespace import Namespace
8+
from ocp_resources.secret import Secret
9+
from ocp_resources.service_account import ServiceAccount
10+
from ocp_resources.serving_runtime import ServingRuntime
11+
from simple_logger.logger import get_logger
12+
from tests.model_serving.model_runtime.vllm.utils import validate_supported_quantization_schema
13+
from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES, TEMPLATE_MAP
14+
15+
from utilities.constants import (
16+
KServeDeploymentType,
17+
RuntimeTemplates,
18+
Labels,
19+
)
20+
from tests.model_serving.model_server.utils import (
21+
run_vllm_concurrent_load,
22+
run_ovms_concurrent_load,
23+
)
24+
from utilities.constants import (
25+
ModelAndFormat,
26+
)
27+
from utilities.inference_utils import create_isvc
28+
from utilities.serving_runtime import ServingRuntimeFromTemplate
29+
from utilities.constants import THANOS_QUERIER_ADDRESS
30+
from syrupy.extensions.json import JSONSnapshotExtension
31+
32+
# Module-level logger for this fixtures module.
LOGGER = get_logger(name=__name__)
33+
34+
35+
def create_keda_auto_scaling_config(
    query: str,
    target_value: str,
    model_name: str | None = None,
    namespace: str | None = None,
) -> dict[str, Any]:
    """Create a KEDA auto-scaling configuration for an inference service.

    Args:
        query: Prometheus query used as the external scaling metric.
        target_value: Target value for the metric, as a string.
        model_name: Optional model name. Accepted because callers pass it,
            but currently not embedded in the generated config.
        namespace: Optional namespace. Accepted because callers pass it,
            but currently not embedded in the generated config.

    Returns:
        dict: Auto-scaling configuration consumable by ``create_isvc``.
    """
    # Fix: callers in this file invoke this helper with model_name=/namespace=
    # keyword arguments; the original signature rejected them (TypeError).
    # They are accepted (and ignored) here to keep both call styles working.
    del model_name, namespace
    return {
        "metrics": [
            {
                "type": "External",
                "external": {
                    "metric": {
                        "backend": "prometheus",
                        # Cluster-internal Thanos querier endpoint.
                        "serverAddress": THANOS_QUERIER_ADDRESS,
                        "query": query,
                    },
                    "target": {"type": "Value", "value": target_value},
                    # NOTE(review): nested "authenticationRef" mirrors the
                    # original structure — confirm against the KEDA/KServe
                    # auto-scaling schema.
                    "authenticationRef": {
                        "authModes": "bearer",
                        "authenticationRef": {
                            "name": "inference-prometheus-auth",
                        },
                    },
                },
            }
        ]
    }
71+
72+
73+
@pytest.fixture(scope="class")
def vllm_serving_runtime(
    request: FixtureRequest,
    admin_client: DynamicClient,
    model_namespace: Namespace,
    supported_accelerator_type: str,
    vllm_runtime_image: str,
) -> Generator[ServingRuntime, None, None]:
    """Class-scoped vLLM ServingRuntime built from the accelerator-specific template."""
    # Fall back to the CUDA template when the accelerator has no dedicated one.
    selected_template = TEMPLATE_MAP.get(
        supported_accelerator_type.lower(), RuntimeTemplates.VLLM_CUDA
    )
    runtime_manager = ServingRuntimeFromTemplate(
        client=admin_client,
        name="vllm-runtime",
        namespace=model_namespace.name,
        template_name=selected_template,
        deployment_type=request.param["deployment_type"],
        runtime_image=vllm_runtime_image,
        support_tgis_open_ai_endpoints=True,
    )
    with runtime_manager as model_runtime:
        yield model_runtime
93+
94+
95+
@pytest.fixture(scope="class")
def stressed_keda_vllm_inference_service(
    request: FixtureRequest,
    admin_client: DynamicClient,
    model_namespace: Namespace,
    vllm_serving_runtime: ServingRuntime,
    supported_accelerator_type: str,
    s3_models_storage_uri: str,
    vllm_model_service_account: ServiceAccount,
    response_snapshot: Any,
) -> Generator[InferenceService, Any, Any]:
    """Deploy a KEDA-autoscaled vLLM InferenceService and stress it with load.

    Builds the InferenceService kwargs (GPU resources, replica bounds and the
    KEDA prometheus trigger), waits for readiness, then drives concurrent load
    so the scaling metric has data before the tests inspect it.

    Yields:
        InferenceService: the ready, load-stressed inference service.
    """
    from copy import deepcopy  # local import: only needed by this fixture

    # Fix: the original body referenced an undefined `response_snapshot`
    # name; it is now requested as a fixture parameter.
    # NOTE(review): response_snapshot is function-scoped while this fixture
    # is class-scoped — confirm pytest scope compatibility.
    isvc_kwargs = {
        "client": admin_client,
        "name": request.param["name"],
        "namespace": model_namespace.name,
        "runtime": vllm_serving_runtime.name,
        "storage_uri": s3_models_storage_uri,
        "model_format": vllm_serving_runtime.instance.spec.supportedModelFormats[0].name,
        "model_service_account": vllm_model_service_account.name,
        "deployment_mode": request.param.get("deployment_mode", KServeDeploymentType.RAW_DEPLOYMENT),
        "autoscaler_mode": "keda",
        "external_route": True,
    }

    accelerator_type = supported_accelerator_type.lower()
    gpu_count = request.param.get("gpu_count")
    timeout = request.param.get("timeout")
    identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, Labels.Nvidia.NVIDIA_COM_GPU)

    # Fix: PREDICT_RESOURCES is a shared module-level dict; copy before
    # mutating so GPU requests do not leak between parametrizations.
    resources: Any = deepcopy(PREDICT_RESOURCES["resources"])
    resources["requests"][identifier] = gpu_count
    resources["limits"][identifier] = gpu_count
    isvc_kwargs["resources"] = resources

    if timeout:
        isvc_kwargs["timeout"] = timeout
    # Fix: guard against gpu_count being absent (None > 1 raises TypeError).
    if gpu_count and gpu_count > 1:
        isvc_kwargs["volumes"] = PREDICT_RESOURCES["volumes"]
        isvc_kwargs["volumes_mounts"] = PREDICT_RESOURCES["volume_mounts"]

    if arguments := request.param.get("runtime_argument"):
        # Drop caller-provided flags that this fixture controls itself.
        arguments = [
            arg
            for arg in arguments
            if not (arg.startswith("--tensor-parallel-size") or arg.startswith("--quantization"))
        ]
        arguments.append(f"--tensor-parallel-size={gpu_count}")
        if quantization := request.param.get("quantization"):
            validate_supported_quantization_schema(q_type=quantization)
            arguments.append(f"--quantization={quantization}")
        isvc_kwargs["argument"] = arguments

    isvc_kwargs["min_replicas"] = request.param.get("initial_pod_count")
    isvc_kwargs["max_replicas"] = request.param.get("final_pod_count")

    # Fix: the original passed model_name=/namespace= kwargs that
    # create_keda_auto_scaling_config() did not accept.
    isvc_kwargs["auto_scaling"] = create_keda_auto_scaling_config(
        query=request.param.get("metrics_query"),
        target_value=str(request.param.get("metrics_threshold")),
    )

    with create_isvc(**isvc_kwargs) as isvc:
        isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
        # Generate traffic so the prometheus scaling metric is produced.
        run_vllm_concurrent_load(isvc=isvc, response_snapshot=response_snapshot)
        yield isvc
156+
157+
158+
@pytest.fixture(scope="class")
def stressed_ovms_keda_inference_service(
    request: FixtureRequest,
    unprivileged_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    ovms_kserve_serving_runtime: ServingRuntime,
    models_endpoint_s3_secret: Secret,
) -> Generator[InferenceService, Any, Any]:
    """Deploy a KEDA-autoscaled OVMS InferenceService and stress it with load.

    Creates a raw-deployment OVMS service with the requested replica bounds
    and KEDA prometheus trigger, waits for readiness, then drives concurrent
    load so the scaling metric has data before the tests inspect it.

    Yields:
        InferenceService: the ready, load-stressed inference service.
    """
    model_name = f"{request.param['name']}-raw"
    with create_isvc(
        client=unprivileged_client,
        name=model_name,
        namespace=unprivileged_model_namespace.name,
        external_route=True,
        runtime=ovms_kserve_serving_runtime.name,
        storage_path=request.param["model-dir"],
        storage_key=models_endpoint_s3_secret.name,
        model_format=ModelAndFormat.OPENVINO_IR,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        model_version=request.param["model-version"],
        min_replicas=request.param["initial_pod_count"],
        max_replicas=request.param["final_pod_count"],
        autoscaler_mode="keda",
        # Fix: the original passed model_name=/namespace= kwargs that
        # create_keda_auto_scaling_config() did not accept (TypeError).
        auto_scaling=create_keda_auto_scaling_config(
            query=request.param["metrics_query"],
            target_value=str(request.param["metrics_threshold"]),
        ),
    ) as isvc:
        isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
        # Generate traffic so the prometheus scaling metric is produced.
        run_ovms_concurrent_load(isvc=isvc)
        yield isvc
191+
192+
193+
@pytest.fixture(scope="session")
def skip_if_no_supported_gpu_type(supported_accelerator_type: str) -> None:
    """Skip the requesting test when no supported accelerator type is configured."""
    if not supported_accelerator_type:
        # Fix: corrected the typos in the skip message
        # ("Accelartor type is not provide,vLLM test can not be run on CPU").
        pytest.skip("Accelerator type is not provided; vLLM tests cannot run on CPU")
197+
198+
199+
@pytest.fixture
def response_snapshot(snapshot: Any) -> Any:
    """Syrupy snapshot fixture that serializes comparisons as JSON."""
    json_snapshot = snapshot.use_extension(extension_class=JSONSnapshotExtension)
    return json_snapshot
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import pytest
2+
from simple_logger.logger import get_logger
3+
from typing import Any, Generator
4+
from kubernetes.dynamic import DynamicClient
5+
from ocp_resources.inference_service import InferenceService
6+
from tests.model_serving.model_server.utils import verify_keda_scaledobject, verify_final_pod_count
7+
from tests.model_serving.model_runtime.vllm.constant import BASE_RAW_DEPLOYMENT_CONFIG
8+
from tests.model_serving.model_runtime.vllm.basic_model_deployment.test_granite_7b_starter import SERVING_ARGUMENT
9+
from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs
10+
from utilities.monitoring import validate_metrics_field
11+
12+
# Module-level logger for this test module.
LOGGER = get_logger(name=__name__)


# NOTE(review): mutates the shared BASE_RAW_DEPLOYMENT_CONFIG dict at import
# time; other modules importing the same dict will observe this change.
BASE_RAW_DEPLOYMENT_CONFIG["runtime_argument"] = SERVING_ARGUMENT

# Replica counts the KEDA ScaledObject is expected to scale between.
INITIAL_POD_COUNT = 1
FINAL_POD_COUNT = 5

OVMS_MODEL_NAMESPACE = "ovms-keda"
OVMS_MODEL_NAME = "onnx-raw"
# Average OVMS inference time over a 5m window (sum/count of the time
# histogram), per model name. The doubled braces are f-string escapes that
# produce literal '{' / '}' in the PromQL label selector.
OVMS_METRICS_QUERY = (
    f"sum by (name) (rate(ovms_inference_time_us_sum{{"
    f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
    f"}}[5m])) / "
    f"sum by (name) (rate(ovms_inference_time_us_count{{"
    f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
    f"}}[5m]))"
)
# Scaling threshold for the query above — presumably microseconds, matching
# the ovms_inference_time_us metric units; TODO confirm.
OVMS_METRICS_THRESHOLD = 200

# Mark every test in this module as a KEDA test and require valid AWS config.
pytestmark = [pytest.mark.keda, pytest.mark.usefixtures("valid_aws_config")]
33+
34+
35+
@pytest.mark.parametrize(
    "unprivileged_model_namespace, ovms_kserve_serving_runtime, stressed_ovms_keda_inference_service",
    [
        pytest.param(
            {"name": "ovms-keda"},
            RunTimeConfigs.ONNX_OPSET13_RUNTIME_CONFIG,
            {
                "name": ModelFormat.ONNX,
                "model-version": ModelVersion.OPSET13,
                "model-dir": "test-dir",
                "initial_pod_count": INITIAL_POD_COUNT,
                "final_pod_count": FINAL_POD_COUNT,
                "metrics_query": OVMS_METRICS_QUERY,
                "metrics_threshold": OVMS_METRICS_THRESHOLD,
            },
        )
    ],
    indirect=True,
)
class TestOVMSKedaScaling:
    """End-to-end checks for KEDA-driven scaling of an OVMS inference service."""

    def test_ovms_keda_scaling_verify_scaledobject(
        self,
        unprivileged_client: DynamicClient,
        stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
    ):
        """The generated ScaledObject carries the expected prometheus trigger."""
        expectations = {
            "expected_trigger_type": "prometheus",
            "expected_query": OVMS_METRICS_QUERY,
            "expected_threshold": OVMS_METRICS_THRESHOLD,
        }
        verify_keda_scaledobject(
            client=unprivileged_client,
            isvc=stressed_ovms_keda_inference_service,
            **expectations,
        )

    def test_ovms_keda_scaling_verify_metrics(
        self,
        prometheus,
    ):
        """Under load, the scaling metric rises above the configured threshold."""
        query_check = {
            "metrics_query": OVMS_METRICS_QUERY,
            "expected_value": str(OVMS_METRICS_THRESHOLD),
            "greater_than": True,
        }
        validate_metrics_field(prometheus=prometheus, **query_check)

    def test_ovms_keda_scaling_verify_final_pod_count(
        self,
        unprivileged_client: DynamicClient,
        stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
    ):
        """KEDA scales the deployment out to the expected replica count."""
        verify_final_pod_count(
            unprivileged_client=unprivileged_client,
            isvc=stressed_ovms_keda_inference_service,
            final_pod_count=FINAL_POD_COUNT,
        )
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import pytest
2+
from simple_logger.logger import get_logger
3+
from typing import Any, Generator
4+
from kubernetes.dynamic import DynamicClient
5+
from ocp_resources.inference_service import InferenceService
6+
from utilities.constants import KServeDeploymentType
7+
from tests.model_serving.model_server.utils import verify_keda_scaledobject, verify_final_pod_count
8+
from tests.model_serving.model_runtime.vllm.constant import BASE_RAW_DEPLOYMENT_CONFIG
9+
from tests.model_serving.model_runtime.vllm.basic_model_deployment.test_granite_7b_starter import (
10+
SERVING_ARGUMENT,
11+
MODEL_PATH,
12+
)
13+
from utilities.monitoring import validate_metrics_field
14+
15+
# Module-level logger for this test module.
LOGGER = get_logger(name=__name__)


# NOTE(review): mutates the shared BASE_RAW_DEPLOYMENT_CONFIG dict at import
# time; other modules importing the same dict will observe this change.
BASE_RAW_DEPLOYMENT_CONFIG["runtime_argument"] = SERVING_ARGUMENT

# Replica counts the KEDA ScaledObject is expected to scale between.
INITIAL_POD_COUNT = 1
FINAL_POD_COUNT = 5

VLLM_MODEL_NAME = "granite-vllm-keda"
# In-flight request count on the model's pods; assumes the namespace name
# equals the model name — TODO confirm against the parametrization below.
VLLM_METRICS_QUERY_REQUESTS = f'vllm:num_requests_running{{namespace="{VLLM_MODEL_NAME}",pod=~"{VLLM_MODEL_NAME}.*"}}'
VLLM_METRICS_THRESHOLD_REQUESTS = 4

# All tests here are KEDA tests and require a supported GPU plus AWS config.
pytestmark = [pytest.mark.keda, pytest.mark.usefixtures("skip_if_no_supported_gpu_type", "valid_aws_config")]
28+
29+
30+
@pytest.mark.parametrize(
    "model_namespace, s3_models_storage_uri, vllm_serving_runtime, stressed_keda_vllm_inference_service",
    [
        pytest.param(
            {"name": VLLM_MODEL_NAME},
            {"model-dir": MODEL_PATH},
            {"deployment_type": KServeDeploymentType.RAW_DEPLOYMENT},
            {
                **BASE_RAW_DEPLOYMENT_CONFIG,
                "gpu_count": 1,
                "name": VLLM_MODEL_NAME,
                "initial_pod_count": INITIAL_POD_COUNT,
                "final_pod_count": FINAL_POD_COUNT,
                "metrics_query": VLLM_METRICS_QUERY_REQUESTS,
                "metrics_threshold": VLLM_METRICS_THRESHOLD_REQUESTS,
            },
            id=f"{VLLM_MODEL_NAME}-single-gpu",
        ),
    ],
    indirect=True,
)
class TestVllmKedaScaling:
    """End-to-end checks for KEDA-driven scaling of a vLLM inference service.

    NOTE(review): these tests use ``unprivileged_client`` while the service
    fixture deploys with ``admin_client`` — confirm both clients see the
    same namespace.
    """

    def test_vllm_keda_scaling_verify_scaledobject(
        self,
        unprivileged_client: DynamicClient,
        stressed_keda_vllm_inference_service: Generator[InferenceService, Any, Any],
    ):
        """The generated ScaledObject carries the expected prometheus trigger."""
        expectations = {
            "expected_trigger_type": "prometheus",
            "expected_query": VLLM_METRICS_QUERY_REQUESTS,
            "expected_threshold": VLLM_METRICS_THRESHOLD_REQUESTS,
        }
        verify_keda_scaledobject(
            client=unprivileged_client,
            isvc=stressed_keda_vllm_inference_service,
            **expectations,
        )

    def test_vllm_keda_scaling_verify_metrics(
        self,
        prometheus,
    ):
        """Under load, the scaling metric rises above the configured threshold."""
        query_check = {
            "metrics_query": VLLM_METRICS_QUERY_REQUESTS,
            "expected_value": str(VLLM_METRICS_THRESHOLD_REQUESTS),
            "greater_than": True,
        }
        validate_metrics_field(prometheus=prometheus, **query_check)

    def test_vllm_keda_scaling_verify_final_pod_count(
        self,
        unprivileged_client: DynamicClient,
        stressed_keda_vllm_inference_service: Generator[InferenceService, Any, Any],
    ):
        """KEDA scales the deployment out to the expected replica count."""
        verify_final_pod_count(
            unprivileged_client=unprivileged_client,
            isvc=stressed_keda_vllm_inference_service,
            final_pod_count=FINAL_POD_COUNT,
        )

0 commit comments

Comments
 (0)