Skip to content

Commit 2beeda7

Browse files
authored
Merge branch 'main' into oci_registry
2 parents 1495f2c + d0127ef commit 2beeda7

File tree

4 files changed: 57 additions and 25 deletions.

tests/model_serving/model_server/keda/conftest.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
from tests.model_serving.model_runtime.vllm.utils import validate_supported_quantization_schema
1313
from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES, TEMPLATE_MAP
1414
from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG
15-
from utilities.manifests.onnx import ONNX_INFERENCE_CONFIG
1615

1716
from utilities.constants import (
1817
KServeDeploymentType,
@@ -36,6 +35,8 @@
3635
def create_keda_auto_scaling_config(
3736
query: str,
3837
target_value: str,
38+
model_name: str,
39+
namespace: Namespace,
3940
) -> dict[str, Any]:
4041
"""Create KEDA auto-scaling configuration for inference services.
4142
@@ -54,6 +55,7 @@ def create_keda_auto_scaling_config(
5455
"type": "External",
5556
"external": {
5657
"metric": {
58+
"namespace": namespace,
5759
"backend": "prometheus",
5860
"serverAddress": THANOS_QUERIER_ADDRESS,
5961
"query": query,
@@ -144,7 +146,7 @@ def stressed_keda_vllm_inference_service(
144146

145147
isvc_kwargs["auto_scaling"] = create_keda_auto_scaling_config(
146148
query=request.param.get("metrics_query"),
147-
model_name=request.param["name"],
149+
model_name=request.param["model-name"],
148150
namespace=model_namespace.name,
149151
target_value=str(request.param.get("metrics_threshold")),
150152
)
@@ -165,7 +167,7 @@ def stressed_ovms_keda_inference_service(
165167
unprivileged_client: DynamicClient,
166168
unprivileged_model_namespace: Namespace,
167169
ovms_kserve_serving_runtime: ServingRuntime,
168-
models_endpoint_s3_secret: Secret,
170+
ci_endpoint_s3_secret: Secret,
169171
) -> Generator[InferenceService, Any, Any]:
170172
model_name = f"{request.param['name']}-raw"
171173
with create_isvc(
@@ -175,7 +177,7 @@ def stressed_ovms_keda_inference_service(
175177
external_route=True,
176178
runtime=ovms_kserve_serving_runtime.name,
177179
storage_path=request.param["model-dir"],
178-
storage_key=models_endpoint_s3_secret.name,
180+
storage_key=ci_endpoint_s3_secret.name,
179181
model_format=ModelAndFormat.OPENVINO_IR,
180182
deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
181183
model_version=request.param["model-version"],
@@ -189,11 +191,6 @@ def stressed_ovms_keda_inference_service(
189191
target_value=str(request.param["metrics_threshold"]),
190192
),
191193
) as isvc:
192-
isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
193-
run_concurrent_load_for_keda_scaling(
194-
isvc=isvc,
195-
inference_config=ONNX_INFERENCE_CONFIG,
196-
)
197194
yield isvc
198195

199196

tests/model_serving/model_server/keda/test_isvc_keda_scaling_cpu.py

Lines changed: 44 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,18 @@
44
from kubernetes.dynamic import DynamicClient
55
from ocp_resources.namespace import Namespace
66
from ocp_resources.inference_service import InferenceService
7-
from tests.model_serving.model_server.utils import verify_keda_scaledobject, verify_final_pod_count
7+
from tests.model_serving.model_server.utils import (
8+
verify_keda_scaledobject,
9+
verify_final_pod_count,
10+
run_inference_multiple_times,
11+
)
812
from tests.model_serving.model_runtime.vllm.constant import BASE_RAW_DEPLOYMENT_CONFIG
913
from tests.model_serving.model_runtime.vllm.basic_model_deployment.test_granite_7b_starter import SERVING_ARGUMENT
10-
from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs
14+
from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs, Protocols, Timeout
1115
from utilities.monitoring import validate_metrics_field
16+
from utilities.inference_utils import Inference
17+
from utilities.manifests.onnx import ONNX_INFERENCE_CONFIG
18+
from utilities.jira import is_jira_open
1219

1320
LOGGER = get_logger(name=__name__)
1421

@@ -21,14 +28,9 @@
2128
OVMS_MODEL_NAMESPACE = "ovms-keda"
2229
OVMS_MODEL_NAME = "onnx-raw"
2330
OVMS_METRICS_QUERY = (
24-
f"sum by (name) (rate(ovms_inference_time_us_sum{{"
25-
f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
26-
f"}}[5m])) / "
27-
f"sum by (name) (rate(ovms_inference_time_us_count{{"
28-
f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
29-
f"}}[5m]))"
31+
f'sum(sum_over_time(ovms_requests_success{{namespace="{OVMS_MODEL_NAMESPACE}", name="{OVMS_MODEL_NAME}"}}[5m]))'
3032
)
31-
OVMS_METRICS_THRESHOLD = 200
33+
OVMS_METRICS_THRESHOLD = 2.0
3234

3335
pytestmark = [pytest.mark.keda, pytest.mark.usefixtures("valid_aws_config")]
3436

@@ -43,6 +45,7 @@
4345
"name": ModelFormat.ONNX,
4446
"model-version": ModelVersion.OPSET13,
4547
"model-dir": "test-dir",
48+
"model-name": OVMS_MODEL_NAME,
4649
"initial_pod_count": INITIAL_POD_COUNT,
4750
"final_pod_count": FINAL_POD_COUNT,
4851
"metrics_query": OVMS_METRICS_QUERY,
@@ -64,14 +67,42 @@ def test_ovms_keda_scaling_verify_scaledobject(
6467
unprivileged_client: DynamicClient,
6568
ovms_kserve_serving_runtime,
6669
stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
70+
admin_client: DynamicClient,
6771
):
72+
"""Test KEDA ScaledObject configuration and run inference multiple times to trigger scaling."""
73+
74+
if is_jira_open(jira_id="RHOAIENG-31386", admin_client=admin_client):
75+
patch_operations = [
76+
{
77+
"op": "add",
78+
"path": "/spec/predictor/autoScaling/metrics/0/external/authenticationRef",
79+
"value": {"authModes": "bearer", "authenticationRef": {"name": "inference-prometheus-auth"}},
80+
}
81+
]
82+
admin_client.resources.get(api_version="v1beta1", kind="InferenceService").patch(
83+
name=stressed_ovms_keda_inference_service.name,
84+
namespace=stressed_ovms_keda_inference_service.namespace,
85+
body=patch_operations,
86+
content_type="application/json-patch+json",
87+
)
88+
6889
verify_keda_scaledobject(
6990
client=unprivileged_client,
7091
isvc=stressed_ovms_keda_inference_service,
7192
expected_trigger_type="prometheus",
7293
expected_query=OVMS_METRICS_QUERY,
7394
expected_threshold=OVMS_METRICS_THRESHOLD,
7495
)
96+
# Run inference multiple times to test KEDA scaling
97+
run_inference_multiple_times(
98+
isvc=stressed_ovms_keda_inference_service,
99+
inference_config=ONNX_INFERENCE_CONFIG,
100+
inference_type=Inference.INFER,
101+
protocol=Protocols.HTTPS,
102+
model_name=OVMS_MODEL_NAME,
103+
iterations=10,
104+
run_in_parallel=True,
105+
)
75106

76107
def test_ovms_keda_scaling_verify_metrics(
77108
self,
@@ -81,11 +112,13 @@ def test_ovms_keda_scaling_verify_metrics(
81112
stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
82113
prometheus,
83114
):
115+
"""Test that OVMS metrics are available and above the expected threshold."""
84116
validate_metrics_field(
85117
prometheus=prometheus,
86118
metrics_query=OVMS_METRICS_QUERY,
87-
expected_value=str(OVMS_METRICS_THRESHOLD),
119+
expected_value=OVMS_METRICS_THRESHOLD,
88120
greater_than=True,
121+
timeout=Timeout.TIMEOUT_5MIN,
89122
)
90123

91124
def test_ovms_keda_scaling_verify_final_pod_count(
@@ -95,6 +128,7 @@ def test_ovms_keda_scaling_verify_final_pod_count(
95128
ovms_kserve_serving_runtime,
96129
stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
97130
):
131+
"""Test that pods scale up to the expected count after load generation."""
98132
verify_final_pod_count(
99133
unprivileged_client=unprivileged_client,
100134
isvc=stressed_ovms_keda_inference_service,

tests/model_serving/model_server/utils.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -258,7 +258,9 @@ def verify_keda_scaledobject(
258258
f"Trigger type {trigger_type} does not match expected {expected_trigger_type}"
259259
)
260260
assert query == expected_query, f"Query {query} does not match expected {expected_query}"
261-
assert threshold == expected_threshold, f"Threshold {threshold} does not match expected {expected_threshold}"
261+
assert int(float(threshold)) == int(float(expected_threshold)), (
262+
f"Threshold {threshold} does not match expected {expected_threshold}"
263+
)
262264

263265

264266
def run_concurrent_load_for_keda_scaling(
@@ -327,7 +329,6 @@ def verify_final_pod_count(unprivileged_client: DynamicClient, isvc: InferenceSe
327329
timeout=Timeout.TIMEOUT_5MIN,
328330
sleep=10,
329331
):
330-
if pods:
331-
assert len(pods) == final_pod_count, (
332-
f"Final pod count {len(pods)} does not match expected {final_pod_count}"
333-
)
332+
if pods and len(pods) == final_pod_count:
333+
return
334+
raise AssertionError(f"Timed out waiting for {final_pod_count} pods. Current pod count: {len(pods) if pods else 0}")

utilities/infra.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1022,7 +1022,7 @@ def get_isvc_keda_scaledobject(client: DynamicClient, isvc: InferenceService) ->
10221022
scaled_object = scaled_object_client.get(namespace=namespace, name=isvc.name + "-predictor")
10231023

10241024
if scaled_object:
1025-
return scaled_object
1025+
return [scaled_object]
10261026
raise ResourceNotFoundError(f"{isvc.name} has no KEDA ScaledObjects")
10271027

10281028

0 commit comments

Comments
 (0)