Merge branch 'main' into oci_registry

fege · web-flow · commit 2beeda7dd392 · 2025-08-06T10:23:18.000+02:00
diff --git a/tests/model_serving/model_server/keda/conftest.py b/tests/model_serving/model_server/keda/conftest.py
@@ -12,7 +12,6 @@
 from tests.model_serving.model_runtime.vllm.utils import validate_supported_quantization_schema
 from tests.model_serving.model_runtime.vllm.constant import ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES, TEMPLATE_MAP
 from utilities.manifests.vllm import VLLM_INFERENCE_CONFIG
-from utilities.manifests.onnx import ONNX_INFERENCE_CONFIG
 
 from utilities.constants import (
     KServeDeploymentType,
@@ -36,6 +35,8 @@
 def create_keda_auto_scaling_config(
     query: str,
     target_value: str,
+    model_name: str,
+    namespace: Namespace,
 ) -> dict[str, Any]:
     """Create KEDA auto-scaling configuration for inference services.
 
@@ -54,6 +55,7 @@ def create_keda_auto_scaling_config(
                 "type": "External",
                 "external": {
                     "metric": {
+                        "namespace": namespace,
                         "backend": "prometheus",
                         "serverAddress": THANOS_QUERIER_ADDRESS,
                         "query": query,
@@ -144,7 +146,7 @@ def stressed_keda_vllm_inference_service(
 
     isvc_kwargs["auto_scaling"] = create_keda_auto_scaling_config(
         query=request.param.get("metrics_query"),
-        model_name=request.param["name"],
+        model_name=request.param["model-name"],
         namespace=model_namespace.name,
         target_value=str(request.param.get("metrics_threshold")),
     )
@@ -165,7 +167,7 @@ def stressed_ovms_keda_inference_service(
     unprivileged_client: DynamicClient,
     unprivileged_model_namespace: Namespace,
     ovms_kserve_serving_runtime: ServingRuntime,
-    models_endpoint_s3_secret: Secret,
+    ci_endpoint_s3_secret: Secret,
 ) -> Generator[InferenceService, Any, Any]:
     model_name = f"{request.param['name']}-raw"
     with create_isvc(
@@ -175,7 +177,7 @@ def stressed_ovms_keda_inference_service(
         external_route=True,
         runtime=ovms_kserve_serving_runtime.name,
         storage_path=request.param["model-dir"],
-        storage_key=models_endpoint_s3_secret.name,
+        storage_key=ci_endpoint_s3_secret.name,
         model_format=ModelAndFormat.OPENVINO_IR,
         deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
         model_version=request.param["model-version"],
@@ -189,11 +191,6 @@ def stressed_ovms_keda_inference_service(
             target_value=str(request.param["metrics_threshold"]),
         ),
     ) as isvc:
-        isvc.wait_for_condition(condition=isvc.Condition.READY, status="True")
-        run_concurrent_load_for_keda_scaling(
-            isvc=isvc,
-            inference_config=ONNX_INFERENCE_CONFIG,
-        )
         yield isvc
 
 
diff --git a/tests/model_serving/model_server/keda/test_isvc_keda_scaling_cpu.py b/tests/model_serving/model_server/keda/test_isvc_keda_scaling_cpu.py
@@ -4,11 +4,18 @@
 from kubernetes.dynamic import DynamicClient
 from ocp_resources.namespace import Namespace
 from ocp_resources.inference_service import InferenceService
-from tests.model_serving.model_server.utils import verify_keda_scaledobject, verify_final_pod_count
+from tests.model_serving.model_server.utils import (
+    verify_keda_scaledobject,
+    verify_final_pod_count,
+    run_inference_multiple_times,
+)
 from tests.model_serving.model_runtime.vllm.constant import BASE_RAW_DEPLOYMENT_CONFIG
 from tests.model_serving.model_runtime.vllm.basic_model_deployment.test_granite_7b_starter import SERVING_ARGUMENT
-from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs
+from utilities.constants import ModelFormat, ModelVersion, RunTimeConfigs, Protocols, Timeout
 from utilities.monitoring import validate_metrics_field
+from utilities.inference_utils import Inference
+from utilities.manifests.onnx import ONNX_INFERENCE_CONFIG
+from utilities.jira import is_jira_open
 
 LOGGER = get_logger(name=__name__)
 
@@ -21,14 +28,9 @@
 OVMS_MODEL_NAMESPACE = "ovms-keda"
 OVMS_MODEL_NAME = "onnx-raw"
 OVMS_METRICS_QUERY = (
-    f"sum by (name) (rate(ovms_inference_time_us_sum{{"
-    f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
-    f"}}[5m])) / "
-    f"sum by (name) (rate(ovms_inference_time_us_count{{"
-    f"namespace='{OVMS_MODEL_NAMESPACE}', name='{OVMS_MODEL_NAME}'"
-    f"}}[5m]))"
+    f'sum(sum_over_time(ovms_requests_success{{namespace="{OVMS_MODEL_NAMESPACE}", name="{OVMS_MODEL_NAME}"}}[5m]))'
 )
-OVMS_METRICS_THRESHOLD = 200
+OVMS_METRICS_THRESHOLD = 2.0
 
 pytestmark = [pytest.mark.keda, pytest.mark.usefixtures("valid_aws_config")]
 
@@ -43,6 +45,7 @@
                 "name": ModelFormat.ONNX,
                 "model-version": ModelVersion.OPSET13,
                 "model-dir": "test-dir",
+                "model-name": OVMS_MODEL_NAME,
                 "initial_pod_count": INITIAL_POD_COUNT,
                 "final_pod_count": FINAL_POD_COUNT,
                 "metrics_query": OVMS_METRICS_QUERY,
@@ -64,14 +67,42 @@ def test_ovms_keda_scaling_verify_scaledobject(
         unprivileged_client: DynamicClient,
         ovms_kserve_serving_runtime,
         stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
+        admin_client: DynamicClient,
     ):
+        """Test KEDA ScaledObject configuration and run inference multiple times to trigger scaling."""
+
+        if is_jira_open(jira_id="RHOAIENG-31386", admin_client=admin_client):
+            patch_operations = [
+                {
+                    "op": "add",
+                    "path": "/spec/predictor/autoScaling/metrics/0/external/authenticationRef",
+                    "value": {"authModes": "bearer", "authenticationRef": {"name": "inference-prometheus-auth"}},
+                }
+            ]
+            admin_client.resources.get(api_version="v1beta1", kind="InferenceService").patch(
+                name=stressed_ovms_keda_inference_service.name,
+                namespace=stressed_ovms_keda_inference_service.namespace,
+                body=patch_operations,
+                content_type="application/json-patch+json",
+            )
+
         verify_keda_scaledobject(
             client=unprivileged_client,
             isvc=stressed_ovms_keda_inference_service,
             expected_trigger_type="prometheus",
             expected_query=OVMS_METRICS_QUERY,
             expected_threshold=OVMS_METRICS_THRESHOLD,
         )
+        # Run inference multiple times to test KEDA scaling
+        run_inference_multiple_times(
+            isvc=stressed_ovms_keda_inference_service,
+            inference_config=ONNX_INFERENCE_CONFIG,
+            inference_type=Inference.INFER,
+            protocol=Protocols.HTTPS,
+            model_name=OVMS_MODEL_NAME,
+            iterations=10,
+            run_in_parallel=True,
+        )
 
     def test_ovms_keda_scaling_verify_metrics(
         self,
@@ -81,11 +112,13 @@ def test_ovms_keda_scaling_verify_metrics(
         stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
         prometheus,
     ):
+        """Test that OVMS metrics are available and above the expected threshold."""
         validate_metrics_field(
             prometheus=prometheus,
             metrics_query=OVMS_METRICS_QUERY,
-            expected_value=str(OVMS_METRICS_THRESHOLD),
+            expected_value=OVMS_METRICS_THRESHOLD,
             greater_than=True,
+            timeout=Timeout.TIMEOUT_5MIN,
         )
 
     def test_ovms_keda_scaling_verify_final_pod_count(
@@ -95,6 +128,7 @@ def test_ovms_keda_scaling_verify_final_pod_count(
         ovms_kserve_serving_runtime,
         stressed_ovms_keda_inference_service: Generator[InferenceService, Any, Any],
     ):
+        """Test that pods scale up to the expected count after load generation."""
         verify_final_pod_count(
             unprivileged_client=unprivileged_client,
             isvc=stressed_ovms_keda_inference_service,
diff --git a/tests/model_serving/model_server/utils.py b/tests/model_serving/model_server/utils.py
@@ -258,7 +258,9 @@ def verify_keda_scaledobject(
         f"Trigger type {trigger_type} does not match expected {expected_trigger_type}"
     )
     assert query == expected_query, f"Query {query} does not match expected {expected_query}"
-    assert threshold == expected_threshold, f"Threshold {threshold} does not match expected {expected_threshold}"
+    assert int(float(threshold)) == int(float(expected_threshold)), (
+        f"Threshold {threshold} does not match expected {expected_threshold}"
+    )
 
 
 def run_concurrent_load_for_keda_scaling(
@@ -327,7 +329,6 @@ def verify_final_pod_count(unprivileged_client: DynamicClient, isvc: InferenceSe
         timeout=Timeout.TIMEOUT_5MIN,
         sleep=10,
     ):
-        if pods:
-            assert len(pods) == final_pod_count, (
-                f"Final pod count {len(pods)} does not match expected {final_pod_count}"
-            )
+        if pods and len(pods) == final_pod_count:
+            return
+    raise AssertionError(f"Timed out waiting for {final_pod_count} pods. Current pod count: {len(pods) if pods else 0}")
diff --git a/utilities/infra.py b/utilities/infra.py
@@ -1022,7 +1022,7 @@ def get_isvc_keda_scaledobject(client: DynamicClient, isvc: InferenceService) ->
     scaled_object = scaled_object_client.get(namespace=namespace, name=isvc.name + "-predictor")
 
     if scaled_object:
-        return scaled_object
+        return [scaled_object]
     raise ResourceNotFoundError(f"{isvc.name} has no KEDA ScaledObjects")