Skip to content

Commit 5b89ab6

Browse files
committed
feat: add canary and concurrency tests
1 parent e4fad15 commit 5b89ab6

File tree

7 files changed

+182
-31
lines changed

7 files changed

+182
-31
lines changed

tests/model_serving/model_server/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,12 @@ def s3_models_inference_service(
148148
if (enable_auth := request.param.get("enable-auth")) is not None:
149149
isvc_kwargs["enable_auth"] = enable_auth
150150

151+
if (scale_metric := request.param.get("scale-metric")) is not None:
152+
isvc_kwargs["scale_metric"] = scale_metric
153+
154+
if (scale_target := request.param.get("scale-target")) is not None:
155+
isvc_kwargs["scale_target"] = scale_target
156+
151157
with create_isvc(**isvc_kwargs) as isvc:
152158
yield isvc
153159

tests/model_serving/model_server/serverless/conftest.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,39 @@
66
from ocp_resources.resource import ResourceEditor
77

88
from tests.model_serving.model_server.serverless.utils import wait_for_canary_rollout
9+
from tests.model_serving.model_server.utils import run_inference_multiple_times
10+
from utilities.constants import ModelFormat, Protocols
11+
from utilities.inference_utils import Inference
12+
from utilities.manifests.caikit_tgis import CAIKIT_TGIS_INFERENCE_CONFIG
913

1014

1115
@pytest.fixture
1216
def inference_service_updated_canary_config(
1317
request: FixtureRequest, s3_models_inference_service: InferenceService
1418
) -> Generator[InferenceService, Any, Any]:
15-
percent = request.param("canary-traffic-percent")
19+
canary_percent = request.param["canary-traffic-percent"]
1620
predictor_config = {
1721
"spec": {
18-
"predictor": {"canaryTrafficPercent": percent},
22+
"predictor": {"canaryTrafficPercent": canary_percent},
1923
}
2024
}
2125

2226
if model_path := request.param.get("model-path"):
23-
predictor_config["spec"]["predictor"]["model"]["storage_path"] = model_path
27+
predictor_config["spec"]["predictor"]["model"] = {"storage": {"path": model_path}}
2428

2529
with ResourceEditor(patches={s3_models_inference_service: predictor_config}):
26-
wait_for_canary_rollout(isvc=s3_models_inference_service, percentage=percent)
30+
wait_for_canary_rollout(isvc=s3_models_inference_service, percentage=canary_percent)
2731
yield s3_models_inference_service
32+
33+
34+
@pytest.fixture
35+
def multiple_tgis_inference_requests(s3_models_inference_service: InferenceService) -> None:
36+
run_inference_multiple_times(
37+
isvc=s3_models_inference_service,
38+
inference_config=CAIKIT_TGIS_INFERENCE_CONFIG,
39+
inference_type=Inference.ALL_TOKENS,
40+
protocol=Protocols.HTTPS,
41+
model_name=ModelFormat.CAIKIT,
42+
iterations=50,
43+
run_in_parallel=True,
44+
)

tests/model_serving/model_server/serverless/test_canary_rollout.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_serverless_before_model_update(
5959
"inference_service_updated_canary_config",
6060
[
6161
pytest.param(
62-
{"canary-traffic-percent": "10", "model-path": ModelStoragePath.FLAN_T5_SMALL_HF},
62+
{"canary-traffic-percent": 30, "model-path": ModelStoragePath.FLAN_T5_SMALL_HF},
6363
)
6464
],
6565
indirect=True,
@@ -72,15 +72,16 @@ def test_serverless_during_canary_rollout(self, inference_service_updated_canary
7272
model_name=ModelAndFormat.FLAN_T5_SMALL_CAIKIT,
7373
inference_type=Inference.ALL_TOKENS,
7474
protocol=Protocols.GRPC,
75-
iterations=5,
76-
percentage=10,
75+
iterations=20,
76+
expected_percentage=30,
77+
tolerance=10,
7778
)
7879

7980
@pytest.mark.parametrize(
8081
"inference_service_updated_canary_config",
8182
[
8283
pytest.param(
83-
{"canary-traffic-percent": "100"},
84+
{"canary-traffic-percent": 100},
8485
)
8586
],
8687
indirect=True,
@@ -94,5 +95,5 @@ def test_serverless_after_canary_rollout(self, inference_service_updated_canary_
9495
inference_type=Inference.ALL_TOKENS,
9596
protocol=Protocols.GRPC,
9697
iterations=5,
97-
percentage=100,
98+
expected_percentage=100,
9899
)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import pytest
2+
3+
from tests.model_serving.model_server.serverless.utils import (
4+
inference_service_pods_sampler,
5+
)
6+
from utilities.constants import (
7+
KServeDeploymentType,
8+
ModelFormat,
9+
ModelInferenceRuntime,
10+
ModelStoragePath,
11+
RuntimeTemplates,
12+
Timeout,
13+
)
14+
15+
pytestmark = [
16+
pytest.mark.serverless,
17+
pytest.mark.sanity,
18+
pytest.mark.usefixtures("valid_aws_config"),
19+
]
20+
21+
22+
@pytest.mark.parametrize(
23+
"model_namespace, serving_runtime_from_template, s3_models_inference_service",
24+
[
25+
pytest.param(
26+
{"name": "serverless-auto-scale"},
27+
{
28+
"name": f"{ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME}",
29+
"template-name": RuntimeTemplates.CAIKIT_TGIS_SERVING,
30+
"multi-model": False,
31+
"enable-http": True,
32+
},
33+
{
34+
"name": f"{ModelFormat.CAIKIT}-auto-scale",
35+
"deployment-mode": KServeDeploymentType.SERVERLESS,
36+
"model-dir": ModelStoragePath.FLAN_T5_SMALL_CAIKIT,
37+
"scale-metric": "concurrency",
38+
"scale-target": 1,
39+
},
40+
)
41+
],
42+
indirect=True,
43+
)
44+
class TestConcurrencyAutoScale:
45+
@pytest.mark.dependency(name="test_auto_scale_using_concurrency")
46+
def test_auto_scale_using_concurrency(
47+
self,
48+
admin_client,
49+
s3_models_inference_service,
50+
multiple_tgis_inference_requests,
51+
):
52+
"""Verify model is successfully scaled up based on concurrency metrics (KPA)"""
53+
for pods in inference_service_pods_sampler(
54+
client=admin_client,
55+
isvc=s3_models_inference_service,
56+
timeout=Timeout.TIMEOUT_1MIN,
57+
):
58+
if pods:
59+
if len(pods) > 1 and all([pod.status == pod.Status.RUNNING for pod in pods]):
60+
return
61+
62+
@pytest.mark.dependency(depends=["test_auto_scale_using_concurrency"])
63+
def test_pods_scaled_down_when_no_requests(self, admin_client, s3_models_inference_service):
64+
"""Verify auto-scaled pods are deleted when there are no inference requests"""
65+
for pods in inference_service_pods_sampler(
66+
client=admin_client,
67+
isvc=s3_models_inference_service,
68+
timeout=Timeout.TIMEOUT_4MIN,
69+
):
70+
if pods and len(pods) == 1:
71+
return

tests/model_serving/model_server/serverless/utils.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
1+
from __future__ import annotations
2+
13
from typing import Any
24

5+
from kubernetes.dynamic import DynamicClient
36
from ocp_resources.inference_service import InferenceService
47
from simple_logger.logger import get_logger
58
from timeout_sampler import TimeoutExpiredError, TimeoutSampler
69

710
from tests.model_serving.model_server.utils import verify_inference_response
811
from utilities.constants import Timeout
912
from utilities.exceptions import InferenceCanaryTrafficError
13+
from utilities.infra import get_pods_by_isvc_label
1014

1115
LOGGER = get_logger(name=__name__)
1216

@@ -51,7 +55,8 @@ def verify_canary_traffic(
5155
protocol: str,
5256
model_name: str,
5357
iterations: int,
54-
percentage: int,
58+
expected_percentage: int,
59+
tolerance: int = 0,
5560
) -> None:
5661
"""
5762
Verify canary traffic percentage against inference_config.
@@ -63,15 +68,17 @@ def verify_canary_traffic(
6368
protocol (str): Protocol.
6469
model_name (str): Model name.
6570
iterations (int): Number of iterations.
66-
percentage (int): Percentage of canary rollout.
71+
expected_percentage (int): Percentage of canary rollout.
72+
tolerance (int): Tolerance of traffic percentage distribution;
73+
difference between actual and expected percentage.
6774
6875
Raises:
6976
InferenceCanaryTrafficError: If canary rollout is not updated
7077
7178
"""
7279
successful_inferences = 0
7380

74-
for _ in range(iterations):
81+
for iteration in range(iterations):
7582
try:
7683
verify_inference_response(
7784
inference_service=isvc,
@@ -81,16 +88,42 @@ def verify_canary_traffic(
8188
model_name=model_name,
8289
use_default_query=True,
8390
)
91+
LOGGER.info(f"Successful inference. Iteration: {iteration + 1}")
8492

8593
successful_inferences += 1
8694

87-
except Exception:
88-
continue
95+
except Exception as ex:
96+
LOGGER.warning(f"Inference failed. Error: {ex}. Previous model was used.")
8997

98+
LOGGER.info(f"Number of inference requests to the new model: {successful_inferences}")
9099
successful_inferences_percentage = successful_inferences / iterations * 100
91100

92-
if successful_inferences_percentage != percentage:
101+
diff_percentage = abs(expected_percentage - successful_inferences_percentage)
102+
103+
if successful_inferences == 0 or diff_percentage > tolerance:
93104
raise InferenceCanaryTrafficError(
94105
f"Percentage of inference requests {successful_inferences_percentage} "
95-
f"to the new model does not match the expected percentage {percentage}. "
106+
f"to the new model does not match the expected percentage {expected_percentage}. "
96107
)
108+
109+
110+
def inference_service_pods_sampler(client: DynamicClient, isvc: InferenceService, timeout: int) -> TimeoutSampler:
111+
"""
112+
Returns TimeoutSampler for inference service.
113+
114+
Args:
115+
client (DynamicClient): DynamicClient object
116+
isvc (InferenceService): InferenceService object
117+
timeout (int): Timeout in seconds
118+
119+
Returns:
120+
TimeoutSampler: TimeoutSampler object
121+
122+
"""
123+
return TimeoutSampler(
124+
wait_timeout=timeout,
125+
sleep=1,
126+
func=get_pods_by_isvc_label,
127+
client=client,
128+
isvc=isvc,
129+
)

tests/model_serving/model_server/utils.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,25 +132,32 @@ def verify_inference_response(
132132
res[inference.inference_response_key_name],
133133
re.MULTILINE,
134134
):
135-
assert "".join(output) == expected_response_text
135+
assert "".join(output) == expected_response_text, (
136+
f"Expected: {expected_response_text} does not match response: {output}"
137+
)
136138

137139
elif inference_type == inference.INFER or use_regex:
138140
formatted_res = json.dumps(res[inference.inference_response_text_key_name]).replace(" ", "")
139141
if use_regex:
140-
assert re.search(expected_response_text, formatted_res) # type: ignore[arg-type] # noqa: E501
142+
assert re.search(expected_response_text, formatted_res), ( # type: ignore[arg-type] # noqa: E501
143+
f"Expected: {expected_response_text} not found in: {formatted_res}"
144+
)
141145

142146
else:
143-
assert (
144-
json.dumps(res[inference.inference_response_key_name]).replace(" ", "")
145-
== expected_response_text
147+
formatted_res = json.dumps(res[inference.inference_response_key_name]).replace(" ", "")
148+
assert formatted_res == expected_response_text, (
149+
f"Expected: {expected_response_text} does not match output: {formatted_res}"
146150
)
147151

148152
else:
149153
response = res[inference.inference_response_key_name]
150154
if isinstance(response, list):
151155
response = response[0]
152156

153-
assert response[inference.inference_response_text_key_name] == expected_response_text
157+
response_text = response[inference.inference_response_text_key_name]
158+
assert response_text == expected_response_text, (
159+
f"Expected: {expected_response_text} does not match response: {response_text}"
160+
)
154161

155162
else:
156163
raise InferenceResponseError(f"Inference response output not found in response. Response: {res}")

utilities/inference_utils.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@
3636
)
3737
import portforward
3838

39-
from utilities.jira import is_jira_open
40-
4139
LOGGER = get_logger(name=__name__)
4240

4341

@@ -65,12 +63,12 @@ def get_deployment_type(self) -> str:
6563
Returns:
6664
deployment type
6765
"""
68-
deployment_type = self.inference_service.instance.metadata.annotations.get("serving.kserve.io/deploymentMode")
69-
70-
if is_jira_open(jira_id="RHOAIENG-16954", admin_client=get_client()) and not deployment_type:
71-
return KServeDeploymentType.SERVERLESS
66+
if deployment_type := self.inference_service.instance.metadata.annotations.get(
67+
"serving.kserve.io/deploymentMode"
68+
):
69+
return deployment_type
7270

73-
return deployment_type
71+
return self.inference_service.instance.status.deploymentMode
7472

7573
def get_inference_url(self) -> str:
7674
"""
@@ -524,6 +522,8 @@ def create_isvc(
524522
autoscaler_mode: str | None = None,
525523
multi_node_worker_spec: dict[str, int] | None = None,
526524
timeout: int = Timeout.TIMEOUT_15MIN,
525+
scale_metric: str | None = None,
526+
scale_target: int | None = None,
527527
) -> Generator[InferenceService, Any, Any]:
528528
"""
529529
Create InferenceService object.
@@ -553,6 +553,8 @@ def create_isvc(
553553
multi_node_worker_spec (dict[str, int]): Multi node worker spec
554554
wait_for_predictor_pods (bool): Wait for predictor pods
555555
timeout (int): Time to wait for the model inference,deployment to be ready
556+
scale_metric (str): Scale metric
557+
scale_target (int): Scale target
556558
557559
Yields:
558560
InferenceService: InferenceService object
@@ -625,6 +627,12 @@ def create_isvc(
625627
if multi_node_worker_spec is not None:
626628
predictor_dict["workerSpec"] = multi_node_worker_spec
627629

630+
if scale_metric is not None:
631+
predictor_dict["scaleMetric"] = scale_metric
632+
633+
if scale_target is not None:
634+
predictor_dict["scaleTarget"] = scale_target
635+
628636
with InferenceService(
629637
client=client,
630638
name=name,
@@ -634,9 +642,17 @@ def create_isvc(
634642
label=labels,
635643
) as inference_service:
636644
if wait_for_predictor_pods:
637-
verify_no_failed_pods(client=client, isvc=inference_service, runtime_name=runtime, timeout=timeout)
645+
verify_no_failed_pods(
646+
client=client,
647+
isvc=inference_service,
648+
runtime_name=runtime,
649+
timeout=timeout,
650+
)
638651
wait_for_inference_deployment_replicas(
639-
client=client, isvc=inference_service, runtime_name=runtime, timeout=timeout
652+
client=client,
653+
isvc=inference_service,
654+
runtime_name=runtime,
655+
timeout=timeout,
640656
)
641657

642658
if wait:

0 commit comments

Comments
 (0)