Skip to content

Commit 7edab8d

Browse files
authored
[model server] Add Canary rollout and concurrency scaling tests (#170)
* Create size-labeler.yml * Delete .github/workflows/size-labeler.yml * model mesh - add auth tests * xx * feat: test serverless canary rollout * feat: add canary and concurrency tests
1 parent 38fc0de commit 7edab8d

File tree

10 files changed

+402
-15
lines changed

10 files changed

+402
-15
lines changed

tests/model_serving/model_server/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,12 @@ def s3_models_inference_service(
148148
if (enable_auth := request.param.get("enable-auth")) is not None:
149149
isvc_kwargs["enable_auth"] = enable_auth
150150

151+
if (scale_metric := request.param.get("scale-metric")) is not None:
152+
isvc_kwargs["scale_metric"] = scale_metric
153+
154+
if (scale_target := request.param.get("scale-target")) is not None:
155+
isvc_kwargs["scale_target"] = scale_target
156+
151157
with create_isvc(**isvc_kwargs) as isvc:
152158
yield isvc
153159

tests/model_serving/model_server/serverless/conftest.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
1+
from typing import Any, Generator
2+
13
import pytest
24
from _pytest.fixtures import FixtureRequest
35
from ocp_resources.inference_service import InferenceService
46
from ocp_resources.resource import ResourceEditor
57

8+
from tests.model_serving.model_server.serverless.utils import wait_for_canary_rollout
9+
from tests.model_serving.model_server.utils import run_inference_multiple_times
10+
from utilities.constants import ModelFormat, Protocols
11+
from utilities.inference_utils import Inference
12+
from utilities.manifests.caikit_tgis import CAIKIT_TGIS_INFERENCE_CONFIG
13+
614

715
@pytest.fixture(scope="class")
816
def inference_service_patched_replicas(
@@ -19,3 +27,35 @@ def inference_service_patched_replicas(
1927
).update()
2028

2129
return ovms_serverless_inference_service
30+
31+
32+
@pytest.fixture
def inference_service_updated_canary_config(
    request: FixtureRequest, s3_models_inference_service: InferenceService
) -> Generator[InferenceService, Any, Any]:
    """Patch an InferenceService with a canary rollout and yield it.

    ``request.param`` must provide ``canary-traffic-percent``; it may also
    provide ``model-path`` to point the canary revision at a new model.
    The patch is reverted when the fixture finalizes (ResourceEditor exits).
    """
    percent = request.param["canary-traffic-percent"]

    predictor = {"canaryTrafficPercent": percent}
    new_model_path = request.param.get("model-path")
    if new_model_path:
        predictor["model"] = {"storage": {"path": new_model_path}}

    patch_body = {
        "spec": {
            "predictor": predictor,
        }
    }

    with ResourceEditor(patches={s3_models_inference_service: patch_body}):
        # Block until the latest revision actually carries the requested
        # traffic share before handing the service to the test.
        wait_for_canary_rollout(isvc=s3_models_inference_service, percentage=percent)
        yield s3_models_inference_service
49+
50+
51+
@pytest.fixture
def multiple_tgis_inference_requests(s3_models_inference_service: InferenceService) -> None:
    # Fire a burst of 50 parallel all-tokens inference requests at the caikit
    # model; used to generate enough concurrent load to trigger KPA autoscaling.
    run_inference_multiple_times(
        isvc=s3_models_inference_service,
        inference_config=CAIKIT_TGIS_INFERENCE_CONFIG,
        inference_type=Inference.ALL_TOKENS,
        protocol=Protocols.HTTPS,
        model_name=ModelFormat.CAIKIT,
        iterations=50,
        run_in_parallel=True,
    )
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import pytest
2+
3+
from tests.model_serving.model_server.serverless.utils import verify_canary_traffic
4+
from tests.model_serving.model_server.utils import verify_inference_response
5+
from utilities.constants import (
6+
KServeDeploymentType,
7+
ModelAndFormat,
8+
ModelName,
9+
ModelStoragePath,
10+
Protocols,
11+
RuntimeTemplates,
12+
)
13+
from utilities.inference_utils import Inference
14+
from utilities.manifests.pytorch import PYTORCH_TGIS_INFERENCE_CONFIG
15+
from utilities.manifests.tgis_grpc import TGIS_INFERENCE_CONFIG
16+
17+
pytestmark = [pytest.mark.serverless, pytest.mark.sanity]
18+
19+
20+
@pytest.mark.polarion("ODS-2371")
@pytest.mark.parametrize(
    "model_namespace, serving_runtime_from_template, s3_models_inference_service",
    [
        pytest.param(
            # Namespace for the test class
            {"name": "serverless-canary-rollout"},
            # TGIS serving runtime, gRPC only
            {
                "name": "tgis-runtime",
                "template-name": RuntimeTemplates.TGIS_GRPC_SERVING,
                "multi-model": False,
                "enable-http": False,
                "enable-grpc": True,
            },
            # Serverless InferenceService initially serving the Bloom model
            {
                "name": f"{ModelName.BLOOM_560M}-model",
                "deployment-mode": KServeDeploymentType.SERVERLESS,
                "model-dir": f"{ModelStoragePath.BLOOM_560M_CAIKIT}/artifacts",
                "external-route": True,
            },
        )
    ],
    indirect=True,
)
class TestServerlessCanaryRollout:
    # Canary rollout scenario: start on Bloom, shift 30% of traffic to a
    # flan-t5 revision, then promote it to 100%.
    # NOTE(review): the three tests appear to rely on running in declaration
    # order within the class (each stage patches the same service) — confirm
    # they are not run individually / reordered.

    def test_serverless_before_model_update(
        self,
        s3_models_inference_service,
    ):
        """Test inference with Bloom before model is updated."""
        verify_inference_response(
            inference_service=s3_models_inference_service,
            inference_config=PYTORCH_TGIS_INFERENCE_CONFIG,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.GRPC,
            model_name=ModelAndFormat.BLOOM_560M_CAIKIT,
            use_default_query=True,
        )

    @pytest.mark.parametrize(
        "inference_service_updated_canary_config",
        [
            pytest.param(
                # Route 30% of traffic to the new flan-t5 model path
                {"canary-traffic-percent": 30, "model-path": ModelStoragePath.FLAN_T5_SMALL_HF},
            )
        ],
        indirect=True,
    )
    def test_serverless_during_canary_rollout(self, inference_service_updated_canary_config):
        """Test inference during canary rollout"""
        # With 20 requests and a 30% split, allow +/-10% distribution noise
        verify_canary_traffic(
            isvc=inference_service_updated_canary_config,
            inference_config=TGIS_INFERENCE_CONFIG,
            model_name=ModelAndFormat.FLAN_T5_SMALL_CAIKIT,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.GRPC,
            iterations=20,
            expected_percentage=30,
            tolerance=10,
        )

    @pytest.mark.parametrize(
        "inference_service_updated_canary_config",
        [
            pytest.param(
                # Promote the canary revision to receive all traffic
                {"canary-traffic-percent": 100},
            )
        ],
        indirect=True,
    )
    def test_serverless_after_canary_rollout(self, inference_service_updated_canary_config):
        """Test inference after canary rollout"""
        # All requests must hit the new model, so no tolerance is given
        verify_canary_traffic(
            isvc=inference_service_updated_canary_config,
            inference_config=TGIS_INFERENCE_CONFIG,
            model_name=ModelAndFormat.FLAN_T5_SMALL_CAIKIT,
            inference_type=Inference.ALL_TOKENS,
            protocol=Protocols.GRPC,
            iterations=5,
            expected_percentage=100,
        )
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import pytest
2+
3+
from tests.model_serving.model_server.serverless.utils import (
4+
inference_service_pods_sampler,
5+
)
6+
from utilities.constants import (
7+
KServeDeploymentType,
8+
ModelFormat,
9+
ModelInferenceRuntime,
10+
ModelStoragePath,
11+
RuntimeTemplates,
12+
Timeout,
13+
)
14+
15+
pytestmark = [
16+
pytest.mark.serverless,
17+
pytest.mark.sanity,
18+
pytest.mark.usefixtures("valid_aws_config"),
19+
]
20+
21+
22+
@pytest.mark.parametrize(
23+
"model_namespace, serving_runtime_from_template, s3_models_inference_service",
24+
[
25+
pytest.param(
26+
{"name": "serverless-auto-scale"},
27+
{
28+
"name": f"{ModelInferenceRuntime.CAIKIT_TGIS_RUNTIME}",
29+
"template-name": RuntimeTemplates.CAIKIT_TGIS_SERVING,
30+
"multi-model": False,
31+
"enable-http": True,
32+
},
33+
{
34+
"name": f"{ModelFormat.CAIKIT}-auto-scale",
35+
"deployment-mode": KServeDeploymentType.SERVERLESS,
36+
"model-dir": ModelStoragePath.FLAN_T5_SMALL_CAIKIT,
37+
"scale-metric": "concurrency",
38+
"scale-target": 1,
39+
},
40+
)
41+
],
42+
indirect=True,
43+
)
44+
class TestConcurrencyAutoScale:
45+
@pytest.mark.dependency(name="test_auto_scale_using_concurrency")
46+
def test_auto_scale_using_concurrency(
47+
self,
48+
admin_client,
49+
s3_models_inference_service,
50+
multiple_tgis_inference_requests,
51+
):
52+
"""Verify model is successfully scaled up based on concurrency metrics (KPA)"""
53+
for pods in inference_service_pods_sampler(
54+
client=admin_client,
55+
isvc=s3_models_inference_service,
56+
timeout=Timeout.TIMEOUT_1MIN,
57+
):
58+
if pods:
59+
if len(pods) > 1 and all([pod.status == pod.Status.RUNNING for pod in pods]):
60+
return
61+
62+
@pytest.mark.dependency(requires=["test_auto_scale_using_concurrency"])
63+
def test_pods_scaled_down_when_no_requests(self, admin_client, s3_models_inference_service):
64+
"""Verify auto-scaled pods are deleted when there are no inference requests"""
65+
for pods in inference_service_pods_sampler(
66+
client=admin_client,
67+
isvc=s3_models_inference_service,
68+
timeout=Timeout.TIMEOUT_4MIN,
69+
):
70+
if pods and len(pods) == 1:
71+
return

tests/model_serving/model_server/serverless/utils.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
1+
from __future__ import annotations
2+
3+
from typing import Any
4+
15
from kubernetes.dynamic import DynamicClient
26
from ocp_resources.inference_service import InferenceService
37
from simple_logger.logger import get_logger
48
from timeout_sampler import TimeoutSampler
9+
from timeout_sampler import TimeoutExpiredError
510

11+
from tests.model_serving.model_server.utils import verify_inference_response
612
from utilities.constants import Timeout
13+
from utilities.exceptions import InferenceCanaryTrafficError
714
from utilities.infra import get_pods_by_isvc_label
815

916

@@ -38,3 +45,117 @@ def verify_no_inference_pods(client: DynamicClient, isvc: InferenceService) -> N
3845
except TimeoutError:
3946
LOGGER.error(f"{[pod.name for pod in pods]} were not deleted")
4047
raise
48+
49+
50+
def wait_for_canary_rollout(isvc: InferenceService, percentage: int, timeout: int = Timeout.TIMEOUT_5MIN) -> None:
    """
    Wait for inference service to be updated with canary rollout.

    Polls the predictor's ``status.components.predictor.traffic`` list until an
    entry for the latest revision reports the requested traffic percentage.

    Args:
        isvc (InferenceService): InferenceService object
        percentage (int): Percentage of canary rollout
        timeout (int): Timeout in seconds

    Raises:
        TimeoutExpiredError: If canary rollout is not updated within `timeout`

    """
    # Kept outside the loop so the last observed traffic list can be logged on timeout
    sample = None

    try:
        for sample in TimeoutSampler(
            wait_timeout=timeout,
            sleep=5,
            func=lambda: isvc.instance.status.components.predictor.get("traffic", []),
        ):
            if sample:
                for traffic_info in sample:
                    # Done once the latest revision carries the expected traffic share
                    if traffic_info.get("latestRevision") and traffic_info.get("percent") == percentage:
                        return

    except TimeoutExpiredError:
        LOGGER.error(
            f"InferenceService {isvc.name} canary rollout is not updated to {percentage}. Traffic info:\n{sample}"
        )
        raise
81+
82+
83+
def verify_canary_traffic(
    isvc: InferenceService,
    inference_config: dict[str, Any],
    inference_type: str,
    protocol: str,
    model_name: str,
    iterations: int,
    expected_percentage: int,
    tolerance: int = 0,
) -> None:
    """
    Verify canary traffic percentage against inference_config.

    Sends `iterations` inference requests; a request that validates against the
    new model's `inference_config` counts as new-model traffic, a failing one is
    assumed to have been routed to the previous revision.

    Args:
        isvc (InferenceService): Inference service.
        inference_config (dict[str, Any]): Inference config.
        inference_type (str): Inference type.
        protocol (str): Protocol.
        model_name (str): Model name.
        iterations (int): Number of iterations; must be positive.
        expected_percentage (int): Percentage of canary rollout.
        tolerance (int): Tolerance of traffic percentage distribution;
            difference between actual and expected percentage.

    Raises:
        ValueError: If iterations is not positive.
        InferenceCanaryTrafficError: If the observed traffic split does not
            match the expected percentage within tolerance.

    """
    # Fix: guard against ZeroDivisionError in the percentage computation below.
    if iterations <= 0:
        raise ValueError(f"iterations must be positive, got {iterations}")

    successful_inferences = 0

    for iteration in range(iterations):
        try:
            verify_inference_response(
                inference_service=isvc,
                inference_config=inference_config,
                inference_type=inference_type,
                protocol=protocol,
                model_name=model_name,
                use_default_query=True,
            )
            LOGGER.info(f"Successful inference. Iteration: {iteration + 1}")

            successful_inferences += 1

        except Exception as ex:
            # Requests routed to the previous revision fail verification for the
            # new model's config; treat them as old-model traffic, not errors.
            LOGGER.warning(f"Inference failed. Error: {ex}. Previous model was used.")

    LOGGER.info(f"Number of inference requests to the new model: {successful_inferences}")
    successful_inferences_percentage = successful_inferences / iterations * 100

    diff_percentage = abs(expected_percentage - successful_inferences_percentage)

    # Fix: the original unconditionally failed on successful_inferences == 0,
    # which is wrong when expected_percentage is 0 (all traffic legitimately
    # stays on the old model). Only treat zero successes as a failure when
    # some canary traffic was actually expected.
    if diff_percentage > tolerance or (successful_inferences == 0 and expected_percentage > 0):
        raise InferenceCanaryTrafficError(
            f"Percentage of inference requests {successful_inferences_percentage} "
            f"to the new model does not match the expected percentage {expected_percentage}. "
        )
140+
141+
142+
def inference_service_pods_sampler(client: DynamicClient, isvc: InferenceService, timeout: int) -> TimeoutSampler:
    """
    Build a TimeoutSampler that repeatedly fetches the pods backing an inference service.

    Args:
        client (DynamicClient): DynamicClient object
        isvc (InferenceService): InferenceService object
        timeout (int): Timeout in seconds

    Returns:
        TimeoutSampler: sampler that yields the current isvc pods once per second

    """
    sampler_kwargs = {
        "wait_timeout": timeout,
        "sleep": 1,
        "func": get_pods_by_isvc_label,
        # Remaining kwargs are forwarded to get_pods_by_isvc_label on each call
        "client": client,
        "isvc": isvc,
    }
    return TimeoutSampler(**sampler_kwargs)

0 commit comments

Comments
 (0)