Skip to content

ENH: Add params for submitting to singularity [DRAFT] #652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 51 additions & 3 deletions hi-ml-azure/src/health_azure/himl.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
run_duration_string_to_seconds, to_azure_friendly_string, RUN_CONTEXT, get_workspace,
PathOrString, DEFAULT_ENVIRONMENT_VARIABLES, get_ml_client,
create_python_environment_v2, register_environment_v2, V2_INPUT_DATASET_PATTERN,
V2_OUTPUT_DATASET_PATTERN)
V2_OUTPUT_DATASET_PATTERN, SLATierType)
from health_azure.datasets import (DatasetConfig, StrOrDatasetConfig, setup_local_datasets,
_input_dataset_key, _output_dataset_key, _replace_string_datasets)

Expand Down Expand Up @@ -422,7 +422,13 @@ def submit_run_v2(workspace: Optional[Workspace],
wait_for_completion_show_output: bool = False,
workspace_config_path: Optional[PathOrString] = None,
ml_client: Optional[MLClient] = None,
hyperparam_args: Optional[Dict[str, Any]] = None) -> Job:
docker_base_image: str = "",
docker_shm_size: str = "",
sla_tier: Optional[SLATierType] = None,
instance_type: Optional[str] = None,
instance_count: int = 1,
hyperparam_args: Optional[Dict[str, Any]] = None,
) -> Job:
"""
Starts a v2 AML Job on a given workspace by submitting a command

Expand All @@ -444,6 +450,14 @@ def submit_run_v2(workspace: Optional[Workspace],
the completion of this run (if True).
:param wait_for_completion_show_output: If wait_for_completion is True this parameter indicates whether to show the
run output on sys.stdout.
:param docker_base_image: The Docker base image that should be specified when submitting a job to a virtual cluster.
:param docker_shm_size: The Docker shared memory size that should be used when creating a new Docker image.
:param pip_extra_index_url: If provided, use this PIP package index to find additional packages when building
the Docker image.
    :param sla_tier: Optional service level agreement (SLA) tier if submitting to compute that requires this
        specification. Should be one of 'Premium', 'Standard', 'Basic' or None.
:param instance_type: If more than one instance type is available in the virtual cluster, specify which type to use.
:param instance_count: The number of Azure ML nodes to request for the job.
:param workspace_config_path: If not provided with an AzureML Workspace, then load one given the information in this
config
:param ml_client: An Azure MLClient object for interacting with Azure resources.
Expand Down Expand Up @@ -483,7 +497,25 @@ def submit_run_v2(workspace: Optional[Workspace],
else:
output_datasets_v2 = {}

job_to_submit: Union[Command, Sweep]
sla_tier_str = sla_tier.value if sla_tier else ""

    # The following parameters are specified for submitting jobs to Singularity. Other compute types will ignore them.
job_resources = {
"instance_type": instance_type,
"instance_count": instance_count,
"properties": {
"AISuperComputer": {
"interactive": False,
"imageVersion": docker_base_image,
"slaTier": sla_tier_str,
"tensorboardLogDirectory": "/scratch/tensorboard_logs",
"scalePolicy": {
"autoScaleIntervalInSec": 120,
"maxInstanceTypeCount": 1,
"minInstanceTypeCount": 1,
},
"sshPublicKeys": []}}
}

if hyperparam_args:
param_sampling = hyperparam_args[PARAM_SAMPLING_ARG]
Expand All @@ -500,6 +532,8 @@ def submit_run_v2(workspace: Optional[Workspace],
environment=environment.name + "@latest",
compute=compute_target,
experiment_name=experiment_name,
shm_size=docker_shm_size,
resources=job_resources,
environment_variables={
"JOB_EXECUTION_MODE": "Basic",
}
Expand Down Expand Up @@ -530,6 +564,8 @@ def submit_run_v2(workspace: Optional[Workspace],
environment=environment.name + "@latest",
compute=compute_target,
experiment_name=experiment_name,
shm_size=docker_shm_size,
resources=job_resources,
environment_variables={
"JOB_EXECUTION_MODE": "Basic",
}
Expand Down Expand Up @@ -704,6 +740,9 @@ def submit_to_azure_if_needed( # type: ignore
hyperparam_args: Optional[Dict[str, Any]] = None,
create_output_folders: bool = True,
strictly_aml_v1: bool = False,
sla_tier: Optional[SLATierType] = None,
instance_type: Optional[str] = None,
instance_count: int = 1,
) -> AzureRunInfo: # pragma: no cover
"""
Submit a folder to Azure, if needed and run it.
Expand Down Expand Up @@ -759,6 +798,10 @@ def submit_to_azure_if_needed( # type: ignore
:param hyperdrive_config: A configuration object for Hyperdrive (hyperparameter search).
:param create_output_folders: If True (default), create folders "outputs" and "logs" in the current working folder.
:param strictly_aml_v1: If True, use Azure ML SDK v1. Otherwise, attempt to use Azure ML SDK v2.
    :param sla_tier: Optional service level agreement (SLA) tier if submitting to compute that requires this
        specification. Should be one of 'Premium', 'Standard', 'Basic' or None.
:param instance_type: If more than one instance type is available in the virtual cluster, specify which type to use.
:param instance_count: The number of Azure ML nodes to request for the job.
:return: If the script is submitted to AzureML then we terminate python as the script should be executed in AzureML,
otherwise we return a AzureRunInfo object.
"""
Expand Down Expand Up @@ -892,6 +935,11 @@ def submit_to_azure_if_needed( # type: ignore
tags=tags,
wait_for_completion=wait_for_completion,
wait_for_completion_show_output=wait_for_completion_show_output,
docker_base_image=docker_base_image,
docker_shm_size=docker_shm_size,
sla_tier=sla_tier,
instance_type=instance_type,
instance_count=instance_count,
hyperparam_args=hyperparam_args
)

Expand Down
6 changes: 6 additions & 0 deletions hi-ml-azure/src/health_azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@
PathOrString = Union[Path, str]


class SLATierType(Enum):
    """Service level agreement (SLA) tiers for jobs submitted to Azure ML compute.

    The member values are string literals because the selected tier's ``.value``
    is forwarded verbatim as the job's ``slaTier`` resource property.
    """
    Premium = "Premium"
    Standard = "Standard"
    Basic = "Basic"


class IntTuple(param.NumericTuple):
"""
Parameter class that must always have integer values
Expand Down
14 changes: 13 additions & 1 deletion hi-ml/src/health_ml/deep_learning_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from health_azure.amulet import (ENV_AMLT_PROJECT_NAME, ENV_AMLT_INPUT_OUTPUT,
ENV_AMLT_SNAPSHOT_DIR, ENV_AMLT_AZ_BATCHAI_DIR,
is_amulet_job, get_amulet_aml_working_dir)
from health_azure.utils import (RUN_CONTEXT, PathOrString, is_global_rank_zero, is_running_in_azure_ml)
from health_azure.utils import (RUN_CONTEXT, PathOrString, is_global_rank_zero, is_running_in_azure_ml, SLATierType)
from health_ml.utils import fixed_paths
from health_ml.utils.common_utils import (CHECKPOINT_FOLDER,
create_unique_timestamp_id,
Expand Down Expand Up @@ -574,3 +574,15 @@ def num_gpus_per_node(self) -> int:
logging.warning(
f"You requested max_num_gpus {self.max_num_gpus} but there are only {num_gpus} available.")
return num_gpus


class ComputeParams(param.Parameterized):
    """Configurable parameters describing the compute resources an AzureML job
    should request: SLA tier, instance type and count, and Docker base image.

    Intended to be mixed into a container class so these fields become
    command-line-settable options alongside the other ``param``-based configs.
    """
    # NOTE(review): defaults are empty string / 1, never None, so the Optional
    # annotations on the str/int fields are wider than the actual value range —
    # kept as-is to match the declared interface.
    sla_tier: Optional[SLATierType] = param.ClassSelector(
        default=None, class_=SLATierType, instantiate=False, doc="Optional service level agreement tier"
    )
    instance_type: Optional[str] = param.String(default="", doc="If more than one instance type is available "
                                                                "in the cluster, specify which type to use")
    instance_count: Optional[int] = param.Integer(default=1, doc="Optional number of AML nodes to request in a "
                                                                 "virtual cluster")
    image_version: Optional[str] = param.String(default="", doc="The base Docker image to use for environment "
                                                                "building in AML.")
7 changes: 4 additions & 3 deletions hi-ml/src/health_ml/lightning_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from torch.optim.lr_scheduler import _LRScheduler

from health_azure.utils import create_from_matching_params
from health_ml.deep_learning_config import DatasetParams, OptimizerParams, OutputParams, TrainerParams, \
WorkflowParams
from health_ml.deep_learning_config import (DatasetParams, OptimizerParams, OutputParams, TrainerParams,
WorkflowParams, ComputeParams)
from health_ml.experiment_config import ExperimentConfig
from health_ml.utils.checkpoint_utils import get_best_checkpoint_path
from health_ml.utils.lr_scheduler import SchedulerWithWarmUp
Expand All @@ -25,7 +25,8 @@ class LightningContainer(WorkflowParams,
DatasetParams,
OutputParams,
TrainerParams,
OptimizerParams):
OptimizerParams,
ComputeParams):
"""
A LightningContainer contains all information to train a user-specified PyTorch Lightning model. The model that
should be trained is returned by the `get_model` method. The training data must be returned in the form of
Expand Down
10 changes: 9 additions & 1 deletion hi-ml/src/health_ml/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,11 @@ def after_submission_hook(azure_run: Run) -> None:
datastore=datastore,
use_mounting=use_mounting)

sla_tier = self.lightning_container.sla_tier
instance_type = self.lightning_container.instance_type
image_version = self.lightning_container.image_version
instance_count = self.lightning_container.instance_count

if self.experiment_config.strictly_aml_v1:
hyperdrive_config = self.lightning_container.get_hyperdrive_config()
hyperparam_args = None
Expand Down Expand Up @@ -282,14 +287,17 @@ def after_submission_hook(azure_run: Run) -> None:
wait_for_completion=self.experiment_config.wait_for_completion,
ignored_folders=[],
submit_to_azureml=bool(self.experiment_config.cluster),
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
docker_base_image=image_version or DEFAULT_DOCKER_BASE_IMAGE,
docker_shm_size=self.experiment_config.docker_shm_size,
hyperdrive_config=hyperdrive_config,
hyperparam_args=hyperparam_args,
create_output_folders=False,
after_submission=after_submission_hook,
tags=self.additional_run_tags(script_params),
strictly_aml_v1=self.experiment_config.strictly_aml_v1,
sla_tier=sla_tier,
instance_type=instance_type,
instance_count=instance_count,
)
else:
azure_run_info = submit_to_azure_if_needed(
Expand Down