add auto-scaling to ads deployments (#1342)

HarrySnart · mrDzurb · web-flow · commit 9751808274dc · 2026-02-20T11:10:47.000-08:00
Co-authored-by: Dmitrii Cherkasov <dmitrii.cherkasov@oracle.com>
diff --git a/ads/common/utils.py b/ads/common/utils.py
@@ -237,6 +237,31 @@ def parse_bool(value: Any) -> bool:
     return bool(value)
 
 
+def parse_int(value: Any, default: Optional[int] = None) -> Optional[int]:
+    """Converts a value to int, falling back to `default` when it is None.
+
+    Parameters
+    ----------
+    value: Any
+        The value to convert; passed to the built-in ``int()`` unless it is None.
+    default: Optional[int]
+        The value to return, unconverted, if `value` is None.
+
+    Returns
+    -------
+    Optional[int]
+        ``int(value)``, or `default` when `value` is None.
+
+    Raises
+    ------
+    ValueError
+        If ``int()`` cannot parse `value`; unsupported types (e.g. list) raise TypeError.
+    """
+    if value is None:
+        return default
+    return int(value)
+
+
 def read_file(file_path: str, **kwargs) -> str:
     try:
         with fsspec.open(file_path, "r", **kwargs.get("auth", {})) as f:
diff --git a/ads/model/deployment/model_deployment.py b/ads/model/deployment/model_deployment.py
@@ -1717,11 +1717,68 @@ def _build_model_deployment_configuration_details(self) -> Dict:
                 infrastructure.capacity_reservation_ids
             )
 
-        scaling_policy = {
-            infrastructure.CONST_POLICY_TYPE: "FIXED_SIZE",
-            infrastructure.CONST_INSTANCE_COUNT: infrastructure.replica
-            or DEFAULT_REPLICA,
-        }
+        def _drop_none_values(d: Dict) -> Dict:
+            """Drops keys with None values from the provided dict."""
+            return {k: v for k, v in d.items() if v is not None}
+
+        # Fixed-size is the default. If autoscaling is configured on infrastructure,
+        # emit an AUTOSCALING policy (supported for both SINGLE_MODEL and MODEL_GROUP).
+        auto_scaling = getattr(infrastructure, "auto_scaling", None) or {}
+        if auto_scaling:
+            scaling_type = str(auto_scaling.get("scalingType", "") or "").lower()
+            metric_type = scaling_type.upper()
+
+            scaling_policy = {
+                infrastructure.CONST_POLICY_TYPE: "AUTOSCALING",
+                "isEnabled": auto_scaling.get("isEnabled", True),
+                "coolDownInSeconds": auto_scaling.get("coolDownInSeconds", None),
+                "autoScalingPolicies": [
+                    _drop_none_values(
+                        {
+                            "autoScalingPolicyType": "THRESHOLD",
+                            "maximumInstanceCount": auto_scaling.get(
+                                "maximumInstanceCount", 3
+                            ),
+                            "minimumInstanceCount": auto_scaling.get(
+                                "minimumInstanceCount", 1
+                            ),
+                            "initialInstanceCount": auto_scaling.get(
+                                "initialInstanceCount",
+                                infrastructure.replica or DEFAULT_REPLICA,
+                            ),
+                            "rules": [
+                                {
+                                    "metricExpressionRuleType": "PREDEFINED_EXPRESSION",
+                                    "metricType": metric_type,
+                                    "scaleInConfiguration": _drop_none_values(
+                                        {
+                                            "scalingConfigurationType": "THRESHOLD",
+                                            "threshold": auto_scaling.get(
+                                                "scaleInThreshold", 30
+                                            ),
+                                        }
+                                    ),
+                                    "scaleOutConfiguration": _drop_none_values(
+                                        {
+                                            "scalingConfigurationType": "THRESHOLD",
+                                            "threshold": auto_scaling.get(
+                                                "scaleOutThreshold", 70
+                                            ),
+                                        }
+                                    ),
+                                }
+                            ],
+                        }
+                    )
+                ],
+            }
+            scaling_policy = _drop_none_values(scaling_policy)
+        else:
+            scaling_policy = {
+                infrastructure.CONST_POLICY_TYPE: "FIXED_SIZE",
+                infrastructure.CONST_INSTANCE_COUNT: infrastructure.replica
+                or DEFAULT_REPLICA,
+            }
 
         if not (runtime.model_uri or runtime.model_group_id):
             raise ValueError(
diff --git a/ads/model/deployment/model_deployment_infrastructure.py b/ads/model/deployment/model_deployment_infrastructure.py
@@ -155,6 +155,26 @@ class ModelDeploymentInfrastructure(Builder):
     CONST_PRIVATE_ENDPOINT_ID = "privateEndpointId"
     CONST_CAPACITY_RESERVATION_IDS = "capacityReservationIds"
 
+    # Autoscaling config (builder-only; used when constructing `scalingPolicy` payload).
+    # This can be applied to both SINGLE_MODEL and MODEL_GROUP deployments.
+    CONST_AUTO_SCALING = "autoScaling"
+    CONST_SCALING_TYPE = "scalingType"
+    CONST_MINIMUM_INSTANCE_COUNT = "minimumInstanceCount"
+    CONST_MAXIMUM_INSTANCE_COUNT = "maximumInstanceCount"
+    CONST_INITIAL_INSTANCE_COUNT = "initialInstanceCount"
+    CONST_SCALE_IN_THRESHOLD = "scaleInThreshold"
+    CONST_SCALE_OUT_THRESHOLD = "scaleOutThreshold"
+    CONST_COOL_DOWN_IN_SECONDS = "coolDownInSeconds"
+    CONST_IS_ENABLED = "isEnabled"
+
+    # Autoscaling constants (for `with_auto_scaling`).
+    CONST_SCALING_TYPE_CPU_UTILIZATION = "cpu_utilization"
+    CONST_SCALING_TYPE_MEMORY_UTILIZATION = "memory_utilization"
+    CONST_SUPPORTED_AUTO_SCALING_TYPES = (
+        CONST_SCALING_TYPE_CPU_UTILIZATION,
+        CONST_SCALING_TYPE_MEMORY_UTILIZATION,
+    )
+
     attribute_map = {
         CONST_PROJECT_ID: "project_id",
         CONST_COMPARTMENT_ID: "compartment_id",
@@ -172,6 +192,7 @@ class ModelDeploymentInfrastructure(Builder):
         CONST_SUBNET_ID: "subnet_id",
         CONST_PRIVATE_ENDPOINT_ID: "private_endpoint_id",
         CONST_CAPACITY_RESERVATION_IDS: "capacity_reservation_ids",
+        CONST_AUTO_SCALING: "auto_scaling",
     }
 
     shape_config_details_attribute_map = {
@@ -720,6 +741,83 @@ def with_capacity_reservation_ids(
             self.CONST_CAPACITY_RESERVATION_IDS, capacity_reservation_ids
         )
 
+    @property
+    def auto_scaling(self) -> Dict:
+        """Autoscaling configuration stored under the `autoScaling` spec key.
+
+        `with_auto_scaling` writes this key. The deployment payload builder is
+        expected to turn a non-empty value into an `AUTOSCALING` scaling policy.
+
+        Returns
+        -------
+        Dict
+            The stored configuration; `{}` is passed to `get_spec` as the default.
+        """
+        return self.get_spec(self.CONST_AUTO_SCALING, {})
+
+    def with_auto_scaling(
+        self,
+        scaling_type: str,
+        minimum_instance_count: int = 1,
+        maximum_instance_count: int = 3,
+        initial_instance_count: int = None,
+        scale_in_threshold: int = 30,
+        scale_out_threshold: int = 70,
+        cool_down_in_seconds: int = None,
+        is_enabled: bool = True,
+    ) -> "ModelDeploymentInfrastructure":
+        """Stores a threshold-based autoscaling config under the `autoScaling` key.
+
+        Parameters
+        ----------
+        scaling_type: str
+            "cpu_utilization" or "memory_utilization" (lower-cased); else ValueError.
+        minimum_instance_count: int
+            Minimum number of instances (default: 1).
+        maximum_instance_count: int
+            Maximum number of instances (default: 3). Not checked against the minimum.
+        initial_instance_count: int
+            Initial number of instances.
+            Defaults to `replica` if set, otherwise `minimum_instance_count`.
+        scale_in_threshold: int
+            Threshold for scaling in (default: 30).
+        scale_out_threshold: int
+            Threshold for scaling out (default: 70).
+        cool_down_in_seconds: int
+            Optional cooldown period; the key is left out of the config when None.
+        is_enabled: bool
+            Enabled flag (default: True); coerced with `bool()`, so "false" is True.
+
+        Returns
+        -------
+        ModelDeploymentInfrastructure
+            The ModelDeploymentInfrastructure instance (self).
+        """
+        scaling_type = str(scaling_type or "").lower()
+        if scaling_type not in self.CONST_SUPPORTED_AUTO_SCALING_TYPES:
+            raise ValueError(
+                "Invalid scaling_type: {}. Allowed values: {}.".format(
+                    scaling_type, list(self.CONST_SUPPORTED_AUTO_SCALING_TYPES)
+                )
+            )
+
+        if initial_instance_count is None:
+            initial_instance_count = self.replica or minimum_instance_count
+
+        config = {
+            self.CONST_SCALING_TYPE: scaling_type,
+            self.CONST_MINIMUM_INSTANCE_COUNT: minimum_instance_count,
+            self.CONST_MAXIMUM_INSTANCE_COUNT: maximum_instance_count,
+            self.CONST_INITIAL_INSTANCE_COUNT: initial_instance_count,
+            self.CONST_SCALE_IN_THRESHOLD: scale_in_threshold,
+            self.CONST_SCALE_OUT_THRESHOLD: scale_out_threshold,
+            self.CONST_IS_ENABLED: bool(is_enabled),
+        }
+        if cool_down_in_seconds is not None:
+            config[self.CONST_COOL_DOWN_IN_SECONDS] = cool_down_in_seconds
+
+        return self.set_spec(self.CONST_AUTO_SCALING, config)
+
     def init(self, **kwargs) -> "ModelDeploymentInfrastructure":
         """Initializes a starter specification for the ModelDeploymentInfrastructure.
 
diff --git a/ads/model/deployment/model_deployment_properties.py b/ads/model/deployment/model_deployment_properties.py
@@ -53,11 +53,35 @@ class ModelDeploymentProperties(
         Return an instance of CreateModelDeploymentDetails for creating the deployment.
     """
 
+    # Autoscaling constants (for `with_instance_configuration`).
+    CONST_SCALING_TYPE_FIXED = "fixed"
+    CONST_SCALING_TYPE_CPU_UTILIZATION = "cpu_utilization"
+    CONST_SCALING_TYPE_MEMORY_UTILIZATION = "memory_utilization"
+    CONST_SUPPORTED_AUTO_SCALING_TYPES = (
+        CONST_SCALING_TYPE_CPU_UTILIZATION,
+        CONST_SCALING_TYPE_MEMORY_UTILIZATION,
+    )
+    CONST_SUPPORTED_SCALING_TYPES = (
+        CONST_SCALING_TYPE_FIXED,
+        *CONST_SUPPORTED_AUTO_SCALING_TYPES,
+    )
+
     # These properties are supported by ModelDeploymentProperties but are not top-level attributes of ModelDeployment
     sub_properties = [
         "instance_shape",
         "instance_count",
         "bandwidth_mbps",
+        # Autoscaling-related keys (used by `with_instance_configuration`).
+        "scaling_type",
+        "cpu_utilization",
+        "memory_utilization",
+        "minimum_instance_count",
+        "maximum_instance_count",
+        "initial_instance_count",
+        "scale_in_threshold",
+        "scale_out_threshold",
+        "cool_down_in_seconds",
+        "is_enabled",
         "access_log_group_id",
         "access_log_id",
         "predict_log_group_id",
@@ -205,6 +229,17 @@ def __init__(
             "bandwidth_mbps",
             "memory_in_gbs",
             "ocpus",
+            # Autoscaling-related keys.
+            "scaling_type",
+            "cpu_utilization",
+            "memory_utilization",
+            "minimum_instance_count",
+            "maximum_instance_count",
+            "initial_instance_count",
+            "scale_in_threshold",
+            "scale_out_threshold",
+            "cool_down_in_seconds",
+            "is_enabled",
         ]:
             if key in kwargs:
                 instance_config[key] = kwargs[key]
@@ -243,6 +278,22 @@ def with_instance_configuration(self, config):
             - memory_in_gbs: float,
             - ocpus: float
 
+            In addition, this method supports autoscaling for SINGLE_MODEL deployments.
+            To enable autoscaling, set `scaling_type` to one of:
+
+            - scaling_type: str. One of ["fixed", "cpu_utilization", "memory_utilization"]. Defaults to "fixed".
+
+            For `cpu_utilization` / `memory_utilization`, the following keys are supported
+            (all optional, sensible defaults will be applied):
+
+            - minimum_instance_count: int (default: 1)
+            - maximum_instance_count: int (default: 3)
+            - initial_instance_count: int (default: instance_count or minimum_instance_count)
+            - scale_in_threshold: int (default: 30)
+            - scale_out_threshold: int (default: 70)
+            - cool_down_in_seconds: int (optional)
+            - is_enabled: bool (default: True)
+
             The instance_shape and instance_count are required when creating a new deployment.
             They are optional when updating an existing deployment.
 
@@ -284,11 +335,86 @@ def with_instance_configuration(self, config):
             instance_configuration_object
         )
 
-        # scaling_policy is required even though it can be initialized with empty values
-        scaling_policy_object = data_science_models.FixedSizeScalingPolicy()
-        if "instance_count" in config:
-            scaling_policy_object.instance_count = int(config["instance_count"])
-        model_configuration_details_object.scaling_policy = scaling_policy_object
+        # scaling_policy is required even though it can be initialized with empty values.
+        # Keep backward compatible behaviour (FixedSizeScalingPolicy) as default,
+        # and enable threshold-based autoscaling when `scaling_type` is not "fixed".
+
+        scaling_type = config.get("scaling_type", None)
+
+        if not scaling_type:
+            scaling_type=self.CONST_SCALING_TYPE_FIXED
+
+
+        scaling_type = str(scaling_type).lower()
+        if scaling_type not in self.CONST_SUPPORTED_SCALING_TYPES:
+            raise ValueError(
+                "Invalid scaling_type: {}. Allowed values: {}.".format(
+                    scaling_type, list(self.CONST_SUPPORTED_SCALING_TYPES)
+                )
+            )
+
+        if scaling_type == self.CONST_SCALING_TYPE_FIXED:
+            scaling_policy_object = data_science_models.FixedSizeScalingPolicy()
+            if "instance_count" in config and config.get("instance_count") is not None:
+                scaling_policy_object.instance_count = int(config["instance_count"])
+            model_configuration_details_object.scaling_policy = scaling_policy_object
+        else:
+            # Metric type is the upper-cased `scaling_type` value from `config`.
+            # Example: cpu_utilization -> CPU_UTILIZATION
+            metric_type = scaling_type.upper()
+
+            minimum_instance_count = utils.parse_int(
+                config.get("minimum_instance_count", config.get("min_instance_count")),
+                1,
+            )
+            maximum_instance_count = utils.parse_int(
+                config.get("maximum_instance_count", config.get("max_instance_count")),
+                3,
+            )
+            # Backward compatibility: allow instance_count to act as initial_instance_count.
+
+            initial_instance_count = utils.parse_int(
+                config.get(
+                    "initial_instance_count",
+                    config.get("instance_count", minimum_instance_count),
+                ),
+                minimum_instance_count,
+            )
+
+            scale_in_threshold = utils.parse_int(config.get("scale_in_threshold"), 30)
+            scale_out_threshold = utils.parse_int(config.get("scale_out_threshold"), 70)
+            is_enabled = config.get("is_enabled", True)
+            cool_down_in_seconds = config.get("cool_down_in_seconds", None)
+
+            threshold_details = data_science_models.ThresholdBasedAutoScalingPolicyDetails(
+                auto_scaling_policy_type="THRESHOLD",
+                maximum_instance_count=maximum_instance_count,
+                minimum_instance_count=minimum_instance_count,
+                initial_instance_count=initial_instance_count,
+                rules=[
+                    data_science_models.PredefinedMetricExpressionRule(
+                        metric_type=metric_type,
+                        scale_in_configuration=data_science_models.PredefinedExpressionThresholdScalingConfiguration(
+                            threshold=scale_in_threshold
+                        ),
+                        scale_out_configuration=data_science_models.PredefinedExpressionThresholdScalingConfiguration(
+                            threshold=scale_out_threshold
+                        ),
+                    )
+                ],
+            )
+
+            auto_scaling_policy = data_science_models.AutoScalingPolicy(
+                policy_type="AUTOSCALING",
+                is_enabled=bool(is_enabled),
+                auto_scaling_policies=[threshold_details],
+            )
+            if cool_down_in_seconds is not None:
+                auto_scaling_policy.cool_down_in_seconds = utils.parse_int(
+                    cool_down_in_seconds
+                )
+
+            model_configuration_details_object.scaling_policy = auto_scaling_policy
 
         if "bandwidth_mbps" in config:
             model_configuration_details_object.bandwidth_mbps = config["bandwidth_mbps"]
diff --git a/tests/unitary/default_setup/model_deployment/test_model_deployment_properties.py b/tests/unitary/default_setup/model_deployment/test_model_deployment_properties.py
diff --git a/tests/unitary/default_setup/model_deployment/test_model_deployment_v2.py b/tests/unitary/default_setup/model_deployment/test_model_deployment_v2.py