Skip to content

Commit efdc2dd

Browse files
committed
Fix test for KEDA CPU scaling
Signed-off-by: Milind Waykole <mwaykole@redhat.com>
1 parent c364c47 commit efdc2dd

4 files changed

Lines changed: 39 additions & 25 deletions

File tree

tests/model_serving/model_server/keda/conftest.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from tests.model_serving.model_runtime.vllm.utils import validate_supported_quantization_schema
1313
from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES, TEMPLATE_MAP
1414
from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG
15-
from utilities.manifests.onnx import ONNX_INFERENCE_CONFIG
1615

1716
from utilities.constants import (
1817
KServeDeploymentType,
@@ -36,6 +35,8 @@
3635
def create_keda_auto_scaling_config(
3736
query: str,
3837
target_value: str,
38+
model_name: str,
39+
namespace: Namespace,
3940
) -> dict[str, Any]:
4041
"""Create KEDA auto-scaling configuration for inference services.
4142
@@ -54,6 +55,7 @@ def create_keda_auto_scaling_config(
5455
"type": "External",
5556
"external": {
5657
"metric": {
58+
"namespace": namespace,
5759
"backend": "prometheus",
5860
"serverAddress": THANOS_QUERIER_ADDRESS,
5961
"query": query,
@@ -144,7 +146,7 @@ def stressed_keda_vllm_inference_service(
144146

145147
isvc_kwargs["auto_scaling"] = create_keda_auto_scaling_config(
146148
query=request.param.get("metrics_query"),
147-
model_name=request.param["name"],
149+
model_name=request.param["model-name"],
148150
namespace=model_namespace.name,
149151
target_value=str(request.param.get("metrics_threshold")),
150152
)
@@ -165,7 +167,7 @@ def stressed_ovms_keda_inference_service(
165167
unprivileged_client: DynamicClient,
166168
unprivileged_model_namespace: Namespace,
167169
ovms_kserve_serving_runtime: ServingRuntime,
168-
models_endpoint_s3_secret: Secret,
170+
ci_endpoint_s3_secret: Secret,
169171
) -> Generator[InferenceService, Any, Any]:
170172
model_name = f"{request.param['name']}-raw"
171173
with create_isvc(
@@ -175,7 +177,7 @@ def stressed_ovms_keda_inference_service(
175177
external_route=True,
176178
runtime=ovms_kserve_serving_runtime.name,
177179
storage_path=request.param["model-dir"],
178-
storage_key=models_endpoint_s3_secret.name,
180+
storage_key=ci_endpoint_s3_secret.name,
179181
model_format=ModelAndFormat.OPENVINO_IR,
180182
deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
181183
model_version=request.param["model-version"],
@@ -189,11 +191,6 @@ def stressed_ovms_keda_inference_service(
189191
target_value=str(request.param["metrics_threshold"]),
190192
),
191193
) as isvc:
192-
isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
193-
run_concurrent_load_for_keda_scaling(
194-
isvc=isvc,
195-
inference_config=ONNX_INFERENCE_CONFIG,
196-
)
197194
yield isvc
198195

199196

tests/model_serving/model_server/keda/test_isvc_keda_scaling_cpu.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44
from kubernetes.dynamic import DynamicClient
55
from ocp_resources.namespace import Namespace
66
from ocp_resources.inference_service import InferenceService
7-
from tests.model_serving.model_server.utils import verify_keda_scaledobject, verify_final_pod_count
7+
from tests.model_serving.model_server.utils import (
8+
verify_keda_scaledobject,
9+
verify_final_pod_count,
10+
run_inference_multiple_times,
11+
)
812
from tests.model_serving.model_runtime.vllm.constant import BASE_RAW_DEPLOYMENT_CONFIG
913
from tests.model_serving.model_runtime.vllm.basic_model_deployment.test_granite_7b_starter import SERVING_ARGUMENT
10-
from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs
14+
from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs, Protocols, Timeout
1115
from utilities.monitoring import validate_metrics_field
16+
from utilities.inference_utils import Inference
17+
from utilities.manifests.onnx import ONNX_INFERENCE_CONFIG
1218

1319
LOGGER = get_logger(name=__name__)
1420

@@ -21,14 +27,9 @@
2127
OVMS_MODEL_NAMESPACE = "ovms-keda"
2228
OVMS_MODEL_NAME = "onnx-raw"
2329
OVMS_METRICS_QUERY = (
24-
f"sum by (name) (rate(ovms_inference_time_us_sum{{"
25-
f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
26-
f"}}[5m])) / "
27-
f"sum by (name) (rate(ovms_inference_time_us_count{{"
28-
f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
29-
f"}}[5m]))"
30+
f'sum(sum_over_time(ovms_requests_success{{namespace="{OVMS_MODEL_NAMESPACE}", name="{OVMS_MODEL_NAME}"}}[5m]))'
3031
)
31-
OVMS_METRICS_THRESHOLD = 200
32+
OVMS_METRICS_THRESHOLD = 2.0
3233

3334
pytestmark = [pytest.mark.keda, pytest.mark.usefixtures("valid_aws_config")]
3435

@@ -43,6 +44,7 @@
4344
"name": ModelFormat.ONNX,
4445
"model-version": ModelVersion.OPSET13,
4546
"model-dir": "test-dir",
47+
"model-name": OVMS_MODEL_NAME,
4648
"initial_pod_count": INITIAL_POD_COUNT,
4749
"final_pod_count": FINAL_POD_COUNT,
4850
"metrics_query": OVMS_METRICS_QUERY,
@@ -65,13 +67,24 @@ def test_ovms_keda_scaling_verify_scaledobject(
6567
ovms_kserve_serving_runtime,
6668
stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
6769
):
70+
"""Test KEDA ScaledObject configuration and run inference multiple times to trigger scaling."""
6871
verify_keda_scaledobject(
6972
client=unprivileged_client,
7073
isvc=stressed_ovms_keda_inference_service,
7174
expected_trigger_type="prometheus",
7275
expected_query=OVMS_METRICS_QUERY,
7376
expected_threshold=OVMS_METRICS_THRESHOLD,
7477
)
78+
# Run inference multiple times to test KEDA scaling
79+
run_inference_multiple_times(
80+
isvc=stressed_ovms_keda_inference_service,
81+
inference_config=ONNX_INFERENCE_CONFIG,
82+
inference_type=Inference.INFER,
83+
protocol=Protocols.HTTPS,
84+
model_name=OVMS_MODEL_NAME,
85+
iterations=10,
86+
run_in_parallel=True,
87+
)
7588

7689
def test_ovms_keda_scaling_verify_metrics(
7790
self,
@@ -81,11 +94,13 @@ def test_ovms_keda_scaling_verify_metrics(
8194
stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
8295
prometheus,
8396
):
97+
"""Test that OVMS metrics are available and above the expected threshold."""
8498
validate_metrics_field(
8599
prometheus=prometheus,
86100
metrics_query=OVMS_METRICS_QUERY,
87-
expected_value=str(OVMS_METRICS_THRESHOLD),
101+
expected_value=OVMS_METRICS_THRESHOLD,
88102
greater_than=True,
103+
timeout=Timeout.TIMEOUT_5MIN,
89104
)
90105

91106
def test_ovms_keda_scaling_verify_final_pod_count(
@@ -95,6 +110,7 @@ def test_ovms_keda_scaling_verify_final_pod_count(
95110
ovms_kserve_serving_runtime,
96111
stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
97112
):
113+
"""Test that pods scale up to the expected count after load generation."""
98114
verify_final_pod_count(
99115
unprivileged_client=unprivileged_client,
100116
isvc=stressed_ovms_keda_inference_service,

tests/model_serving/model_server/utils.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,9 @@ def verify_keda_scaledobject(
258258
f"Trigger type {trigger_type} does not match expected {expected_trigger_type}"
259259
)
260260
assert query == expected_query, f"Query {query} does not match expected {expected_query}"
261-
assert threshold == expected_threshold, f"Threshold {threshold} does not match expected {expected_threshold}"
261+
assert int(float(threshold)) == int(float(expected_threshold)), (
262+
f"Threshold {threshold} does not match expected {expected_threshold}"
263+
)
262264

263265

264266
def run_concurrent_load_for_keda_scaling(
@@ -327,7 +329,6 @@ def verify_final_pod_count(unprivileged_client: DynamicClient, isvc: InferenceSe
327329
timeout=Timeout.TIMEOUT_5MIN,
328330
sleep=10,
329331
):
330-
if pods:
331-
assert len(pods) == final_pod_count, (
332-
f"Final pod count {len(pods)} does not match expected {final_pod_count}"
333-
)
332+
if pods and len(pods) == final_pod_count:
333+
return
334+
raise AssertionError(f"Timed out waiting for {final_pod_count} pods. Current pod count: {len(pods) if pods else 0}")

utilities/infra.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1022,7 +1022,7 @@ def get_isvc_keda_scaledobject(client: DynamicClient, isvc: InferenceService) ->
10221022
scaled_object = scaled_object_client.get(namespace=namespace, name=isvc.name + "-predictor")
10231023

10241024
if scaled_object:
1025-
return scaled_object
1025+
return [scaled_object]
10261026
raise ResourceNotFoundError(f"{isvc.name} has no KEDA ScaledObjects")
10271027

10281028

0 commit comments

Comments
 (0)