Skip to content

ENH: Add params for submitting to singularity [DRAFT] #652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 51 additions & 3 deletions hi-ml-azure/src/health_azure/himl.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
run_duration_string_to_seconds, to_azure_friendly_string, RUN_CONTEXT, get_workspace,
PathOrString, DEFAULT_ENVIRONMENT_VARIABLES, get_ml_client,
create_python_environment_v2, register_environment_v2, V2_INPUT_DATASET_PATTERN,
V2_OUTPUT_DATASET_PATTERN)
V2_OUTPUT_DATASET_PATTERN, SLATierType)
from health_azure.datasets import (DatasetConfig, StrOrDatasetConfig, setup_local_datasets,
_input_dataset_key, _output_dataset_key, _replace_string_datasets)

Expand Down Expand Up @@ -422,7 +422,13 @@ def submit_run_v2(workspace: Optional[Workspace],
wait_for_completion_show_output: bool = False,
workspace_config_path: Optional[PathOrString] = None,
ml_client: Optional[MLClient] = None,
hyperparam_args: Optional[Dict[str, Any]] = None) -> Job:
docker_base_image: str = "",
docker_shm_size: str = "",
sla_tier: Optional[SLATierType] = None,
instance_type: Optional[str] = None,
instance_count: int = 1,
hyperparam_args: Optional[Dict[str, Any]] = None,
) -> Job:
"""
Starts a v2 AML Job on a given workspace by submitting a command

Expand All @@ -444,6 +450,14 @@ def submit_run_v2(workspace: Optional[Workspace],
the completion of this run (if True).
:param wait_for_completion_show_output: If wait_for_completion is True this parameter indicates whether to show the
run output on sys.stdout.
:param docker_base_image: The Docker base image that should be specified when submitting a job to a virtual cluster.
:param docker_shm_size: The Docker shared memory size that should be used when creating a new Docker image.
:param pip_extra_index_url: If provided, use this PIP package index to find additional packages when building
the Docker image.
    :param sla_tier: Optional service level agreement (SLA) tier if submitting to compute that requires this
        specification. Should be one of 'Premium', 'Standard', 'Basic' or None.
:param instance_type: If more than one instance type is available in the virtual cluster, specify which type to use.
:param instance_count: The number of Azure ML nodes to request for the job.
:param workspace_config_path: If not provided with an AzureML Workspace, then load one given the information in this
config
:param ml_client: An Azure MLClient object for interacting with Azure resources.
Expand Down Expand Up @@ -483,7 +497,25 @@ def submit_run_v2(workspace: Optional[Workspace],
else:
output_datasets_v2 = {}

job_to_submit: Union[Command, Sweep]
sla_tier_str = sla_tier.value if sla_tier else ""

    # The following parameters are specified for submitting jobs to Singularity. Other compute types will ignore them.
job_resources = {
"instance_type": instance_type,
"instance_count": instance_count,
"properties": {
"AISuperComputer": {
"interactive": False,
"imageVersion": docker_base_image,
"slaTier": sla_tier_str,
"tensorboardLogDirectory": "/scratch/tensorboard_logs",
"scalePolicy": {
"autoScaleIntervalInSec": 120,
"maxInstanceTypeCount": 1,
"minInstanceTypeCount": 1,
},
"sshPublicKeys": []}}
}

if hyperparam_args:
param_sampling = hyperparam_args[PARAM_SAMPLING_ARG]
Expand All @@ -500,6 +532,8 @@ def submit_run_v2(workspace: Optional[Workspace],
environment=environment.name + "@latest",
compute=compute_target,
experiment_name=experiment_name,
shm_size=docker_shm_size,
resources=job_resources,
environment_variables={
"JOB_EXECUTION_MODE": "Basic",
}
Expand Down Expand Up @@ -530,6 +564,8 @@ def submit_run_v2(workspace: Optional[Workspace],
environment=environment.name + "@latest",
compute=compute_target,
experiment_name=experiment_name,
shm_size=docker_shm_size,
resources=job_resources,
environment_variables={
"JOB_EXECUTION_MODE": "Basic",
}
Expand Down Expand Up @@ -704,6 +740,9 @@ def submit_to_azure_if_needed( # type: ignore
hyperparam_args: Optional[Dict[str, Any]] = None,
create_output_folders: bool = True,
strictly_aml_v1: bool = False,
sla_tier: Optional[SLATierType] = None,
instance_type: Optional[str] = None,
instance_count: int = 1,
) -> AzureRunInfo: # pragma: no cover
"""
Submit a folder to Azure, if needed and run it.
Expand Down Expand Up @@ -759,6 +798,10 @@ def submit_to_azure_if_needed( # type: ignore
:param hyperdrive_config: A configuration object for Hyperdrive (hyperparameter search).
:param create_output_folders: If True (default), create folders "outputs" and "logs" in the current working folder.
:param strictly_aml_v1: If True, use Azure ML SDK v1. Otherwise, attempt to use Azure ML SDK v2.
    :param sla_tier: Optional service level agreement (SLA) tier if submitting to compute that requires this
        specification. Should be one of 'Premium', 'Standard', 'Basic' or None.
:param instance_type: If more than one instance type is available in the virtual cluster, specify which type to use.
:param instance_count: The number of Azure ML nodes to request for the job.
:return: If the script is submitted to AzureML then we terminate python as the script should be executed in AzureML,
otherwise we return a AzureRunInfo object.
"""
Expand Down Expand Up @@ -892,6 +935,11 @@ def submit_to_azure_if_needed( # type: ignore
tags=tags,
wait_for_completion=wait_for_completion,
wait_for_completion_show_output=wait_for_completion_show_output,
docker_base_image=docker_base_image,
docker_shm_size=docker_shm_size,
sla_tier=sla_tier,
instance_type=instance_type,
instance_count=instance_count,
hyperparam_args=hyperparam_args
)

Expand Down
6 changes: 6 additions & 0 deletions hi-ml-azure/src/health_azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@
PathOrString = Union[Path, str]


class SLATierType(Enum):
    """Service level agreement (SLA) tiers for jobs submitted to Azure ML compute.

    The member values are string literals because the selected tier's ``.value``
    is forwarded verbatim as the job's ``slaTier`` resource property.
    """
    Premium = "Premium"
    Standard = "Standard"
    Basic = "Basic"


class IntTuple(param.NumericTuple):
"""
Parameter class that must always have integer values
Expand Down
14 changes: 13 additions & 1 deletion hi-ml/src/health_ml/deep_learning_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from health_azure.amulet import (ENV_AMLT_PROJECT_NAME, ENV_AMLT_INPUT_OUTPUT,
ENV_AMLT_SNAPSHOT_DIR, ENV_AMLT_AZ_BATCHAI_DIR,
is_amulet_job, get_amulet_aml_working_dir)
from health_azure.utils import (RUN_CONTEXT, PathOrString, is_global_rank_zero, is_running_in_azure_ml)
from health_azure.utils import (RUN_CONTEXT, PathOrString, is_global_rank_zero, is_running_in_azure_ml, SLATierType)
from health_ml.utils import fixed_paths
from health_ml.utils.common_utils import (CHECKPOINT_FOLDER,
create_unique_timestamp_id,
Expand Down Expand Up @@ -574,3 +574,15 @@ def num_gpus_per_node(self) -> int:
logging.warning(
f"You requested max_num_gpus {self.max_num_gpus} but there are only {num_gpus} available.")
return num_gpus


class ComputeParams(param.Parameterized):
    """Configurable parameters describing the compute resources an AzureML job
    should request: SLA tier, instance type and count, and Docker base image.

    Intended to be mixed into a container class so these fields become
    command-line-settable options alongside the other ``param``-based configs.
    """
    # NOTE(review): defaults are empty string / 1, never None, so the Optional
    # annotations on the str/int fields are wider than the actual value range —
    # kept as-is to match the declared interface.
    sla_tier: Optional[SLATierType] = param.ClassSelector(
        default=None, class_=SLATierType, instantiate=False, doc="Optional service level agreement tier"
    )
    instance_type: Optional[str] = param.String(default="", doc="If more than one instance type is available "
                                                                "in the cluster, specify which type to use")
    instance_count: Optional[int] = param.Integer(default=1, doc="Optional number of AML nodes to request in a "
                                                                 "virtual cluster")
    image_version: Optional[str] = param.String(default="", doc="The base Docker image to use for environment "
                                                                "building in AML.")
7 changes: 4 additions & 3 deletions hi-ml/src/health_ml/lightning_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from torch.optim.lr_scheduler import _LRScheduler

from health_azure.utils import create_from_matching_params
from health_ml.deep_learning_config import DatasetParams, OptimizerParams, OutputParams, TrainerParams, \
WorkflowParams
from health_ml.deep_learning_config import (DatasetParams, OptimizerParams, OutputParams, TrainerParams,
WorkflowParams, ComputeParams)
from health_ml.experiment_config import ExperimentConfig
from health_ml.utils.checkpoint_utils import get_best_checkpoint_path
from health_ml.utils.lr_scheduler import SchedulerWithWarmUp
Expand All @@ -25,7 +25,8 @@ class LightningContainer(WorkflowParams,
DatasetParams,
OutputParams,
TrainerParams,
OptimizerParams):
OptimizerParams,
ComputeParams):
"""
A LightningContainer contains all information to train a user-specified PyTorch Lightning model. The model that
should be trained is returned by the `get_model` method. The training data must be returned in the form of
Expand Down
10 changes: 9 additions & 1 deletion hi-ml/src/health_ml/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,11 @@ def after_submission_hook(azure_run: Run) -> None:
datastore=datastore,
use_mounting=use_mounting)

sla_tier = self.lightning_container.sla_tier
instance_type = self.lightning_container.instance_type
image_version = self.lightning_container.image_version
instance_count = self.lightning_container.instance_count

if self.experiment_config.strictly_aml_v1:
hyperdrive_config = self.lightning_container.get_hyperdrive_config()
hyperparam_args = None
Expand Down Expand Up @@ -282,14 +287,17 @@ def after_submission_hook(azure_run: Run) -> None:
wait_for_completion=self.experiment_config.wait_for_completion,
ignored_folders=[],
submit_to_azureml=bool(self.experiment_config.cluster),
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
docker_base_image=image_version or DEFAULT_DOCKER_BASE_IMAGE,
docker_shm_size=self.experiment_config.docker_shm_size,
hyperdrive_config=hyperdrive_config,
hyperparam_args=hyperparam_args,
create_output_folders=False,
after_submission=after_submission_hook,
tags=self.additional_run_tags(script_params),
strictly_aml_v1=self.experiment_config.strictly_aml_v1,
sla_tier=sla_tier,
instance_type=instance_type,
instance_count=instance_count,
)
else:
azure_run_info = submit_to_azure_if_needed(
Expand Down