diff --git a/.github/workflows/hi-ml-pr.yml b/.github/workflows/hi-ml-pr.yml index 223c55c2f..413091589 100644 --- a/.github/workflows/hi-ml-pr.yml +++ b/.github/workflows/hi-ml-pr.yml @@ -224,7 +224,7 @@ jobs: with: flags: ${{ matrix.folder }} - himl_smoke_helloworld_v1: + smoke_helloworld_v1: runs-on: ubuntu-20.04 needs: [ cancel-azureml ] steps: @@ -242,7 +242,7 @@ jobs: cd hi-ml make smoke_helloworld_v1 - himl_smoke_helloworld_v2: + smoke_helloworld_v2: runs-on: ubuntu-20.04 needs: [ cancel-azureml ] steps: @@ -260,7 +260,7 @@ jobs: cd hi-ml make smoke_helloworld_v2 - himl_smoke_helloworld_v1_2nodes: + smoke_helloworld_v1_2nodes: runs-on: ubuntu-20.04 needs: [ cancel-azureml ] steps: @@ -278,7 +278,7 @@ jobs: cd hi-ml make smoke_helloworld_v1_2nodes - himl_smoke_helloworld_v2_2nodes: + smoke_helloworld_v2_2nodes: runs-on: ubuntu-20.04 needs: [ cancel-azureml ] steps: @@ -296,15 +296,53 @@ jobs: cd hi-ml make smoke_helloworld_v2_2nodes + smoke_helloworld_v1_crossval: + runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] + steps: + - uses: actions/checkout@v3 + with: + lfs: true + + - name: Set up Python ${{ env.pythonVersion }} + uses: ./.github/actions/prepare_himl_python_env + with: + python-version: ${{ env.pythonVersion }} + + - name: Run smoke_helloworld_v1_crossval + run: | + cd hi-ml + make smoke_helloworld_v1_crossval + + smoke_helloworld_v2_crossval: + runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] + steps: + - uses: actions/checkout@v3 + with: + lfs: true + + - name: Set up Python ${{ env.pythonVersion }} + uses: ./.github/actions/prepare_himl_python_env + with: + python-version: ${{ env.pythonVersion }} + + - name: Run smoke_helloworld_v2_crossval + run: | + cd hi-ml + make smoke_helloworld_v2_crossval + himl-smoke-tests-completed: # This job is just a placeholder to ensure that all smoke tests have completed before # publishing the package. Reference this job rather than the individual smoke tests. runs-on: ubuntu-20.04 needs: [ - himl_smoke_helloworld_v1, - himl_smoke_helloworld_v2, - himl_smoke_helloworld_v1_2nodes, - himl_smoke_helloworld_v2_2nodes, + smoke_helloworld_v1, + smoke_helloworld_v2, + smoke_helloworld_v1_2nodes, + smoke_helloworld_v2_2nodes, + smoke_helloworld_v1_crossval, + smoke_helloworld_v2_crossval, ] steps: - name: Smoke tests completed diff --git a/hi-ml-azure/src/health_azure/himl.py b/hi-ml-azure/src/health_azure/himl.py index 953304a92..ac3c697a1 100644 --- a/hi-ml-azure/src/health_azure/himl.py +++ b/hi-ml-azure/src/health_azure/himl.py @@ -584,6 +584,7 @@ def create_command_job(cmd: str) -> Command: # underlying command such as experiment name and max_total_trials job_to_submit.experiment_name = experiment_name job_to_submit.set_limits(max_total_trials=hyperparam_args.get(MAX_TOTAL_TRIALS_ARG, None)) + job_to_submit.shm_size = docker_shm_size else: job_to_submit = create_command_job(cmd) diff --git a/hi-ml/Makefile b/hi-ml/Makefile index eb1ca58bf..645574e50 100644 --- a/hi-ml/Makefile +++ b/hi-ml/Makefile @@ -103,3 +103,15 @@ smoke_helloworld_v1_2nodes: # HelloWorld model training on 2 nodes, submitted using the v2 SDK smoke_helloworld_v2_2nodes: python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --num_nodes=2 --tag smoke_helloworld_v2_2nodes ${SHARED_ARGS} + +# HelloWorld model training via crossvalidation, submitted using the v1 SDK +# Use the HelloWorldWithMemoryCheck model because there were issues in AzureML with not allocating enough memory in +# Hyperdrive jobs +smoke_helloworld_v1_crossval: + python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=True --crossval_count=2 --tag smoke_helloworld_v1_crossval ${SHARED_ARGS} + +# HelloWorld model training via crossvalidation, submitted using the v2 SDK +# Use the HelloWorldWithMemoryCheck model because there were issues in AzureML with not allocating enough memory in +# Hyperdrive jobs +smoke_helloworld_v2_crossval: + python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=False --crossval_count=2 --tag smoke_helloworld_v2_crossval ${SHARED_ARGS} diff --git a/hi-ml/src/health_ml/configs/hello_world.py b/hi-ml/src/health_ml/configs/hello_world.py index a9cc8562a..dd301596f 100644 --- a/hi-ml/src/health_ml/configs/hello_world.py +++ b/hi-ml/src/health_ml/configs/hello_world.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import os from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -15,6 +16,7 @@ from torch.utils.data import DataLoader, Dataset from health_ml.lightning_container import LightningContainer +from health_ml.utils.common_utils import get_docker_memory_gb TEST_MSE_FILE = "test_mse.txt" TEST_MAE_FILE = "test_mae.txt" @@ -291,3 +293,22 @@ def get_callbacks(self) -> List[Callback]: def get_additional_aml_run_tags(self) -> Dict[str, str]: return {"max_epochs": str(self.max_epochs)} + + +class HelloWorldWithMemoryCheck(HelloWorld): + """ + A variant of the HelloWorld container that checks that there is enough memory available to run the model. + """ + + def __init__(self) -> None: + super().__init__() + self.docker_shm_size_gb = 20 + self.docker_shm_size = f"{self.docker_shm_size_gb}g" + + def before_training_on_global_rank_zero(self) -> None: + docker_memory = get_docker_memory_gb(verbose=True) + assert docker_memory is not None + if docker_memory < self.docker_shm_size_gb: + raise ValueError( + f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {docker_memory} GB" + ) diff --git a/hi-ml/src/health_ml/deep_learning_config.py b/hi-ml/src/health_ml/deep_learning_config.py index 5f56c9006..51fda60e7 100644 --- a/hi-ml/src/health_ml/deep_learning_config.py +++ b/hi-ml/src/health_ml/deep_learning_config.py @@ -168,6 +168,9 @@ class WorkflowParams(param.Parameterized): This class contains all parameters that affect how the whole training and testing workflow is executed. """ + docker_shm_size: str = param.String( + "400g", doc="The Docker shared memory size that is required to run this model in AzureML." + ) random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.") src_checkpoint: CheckpointParser = param.ClassSelector( class_=CheckpointParser, default=None, instantiate=False, doc=CheckpointParser.DOC diff --git a/hi-ml/src/health_ml/experiment_config.py b/hi-ml/src/health_ml/experiment_config.py index b63baf963..11c62b188 100644 --- a/hi-ml/src/health_ml/experiment_config.py +++ b/hi-ml/src/health_ml/experiment_config.py @@ -43,7 +43,6 @@ class ExperimentConfig(param.Parameterized): "over the network). When running outside AzureML, datasets will " "always be mounted.", ) - docker_shm_size: str = param.String("400g", doc="The shared memory in the Docker image for the AzureML VMs.") wait_for_completion: bool = param.Boolean( default=False, doc="If True, wait for AML Run to complete before proceeding. If False, submit the run to AML and exit", diff --git a/hi-ml/src/health_ml/runner.py b/hi-ml/src/health_ml/runner.py index de1ff4487..ea1d30ab2 100755 --- a/hi-ml/src/health_ml/runner.py +++ b/hi-ml/src/health_ml/runner.py @@ -50,7 +50,13 @@ from health_ml.training_runner import TrainingRunner # noqa: E402 from health_ml.utils import fixed_paths # noqa: E402 from health_ml.utils.logging import ConsoleAndFileOutput # noqa: E402 -from health_ml.utils.common_utils import check_conda_environment, choose_conda_env_file, is_linux # noqa: E402 +from health_ml.utils.common_utils import ( + check_conda_environment, + choose_conda_env_file, + get_docker_memory_gb, + get_memory_gb, + initialize_rpdb, +) # noqa: E402 from health_ml.utils.config_loader import ModelConfigLoader # noqa: E402 from health_ml.utils import health_ml_package_setup # noqa: E402 @@ -62,24 +68,6 @@ sys.argv[0] = str(runner_path.resolve()) -def initialize_rpdb() -> None: - """ - On Linux only, import and initialize rpdb, to enable remote debugging if necessary. - """ - # rpdb signal trapping does not work on Windows, as there is no SIGTRAP: - if not is_linux(): - return - import rpdb - - rpdb_port = 4444 - rpdb.handle_trap(port=rpdb_port) - # For some reason, os.getpid() does not return the ID of what appears to be the currently running process. - logging.info( - "rpdb is handling traps. To debug: identify the main runner.py process, then as root: " - f"kill -TRAP ; nc 127.0.0.1 {rpdb_port}" - ) - - def create_runner_parser() -> argparse.ArgumentParser: """ Creates a commandline parser, that understands all necessary arguments for training a model @@ -191,9 +179,9 @@ def run(self) -> Tuple[LightningContainer, AzureRunInfo]: # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable log_level = logging.INFO if is_local_rank_zero() else logging.ERROR logging_to_stdout(log_level) - # When running in Azure, also output logging to a file. This can help in particular when jobs - # get preempted, but we don't get access to the logs from the previous incarnation of the job initialize_rpdb() + get_memory_gb(verbose=True) + get_docker_memory_gb(verbose=True) self.parse_and_load_model() self.validate() azure_run_info = self.submit_to_azureml_if_needed() @@ -278,7 +266,7 @@ def submit_to_azureml_if_needed(self) -> AzureRunInfo: ignored_folders=[], submit_to_azureml=bool(self.experiment_config.cluster), docker_base_image=DEFAULT_DOCKER_BASE_IMAGE, - docker_shm_size=self.experiment_config.docker_shm_size, + docker_shm_size=self.lightning_container.docker_shm_size, hyperdrive_config=hyperdrive_config, hyperparam_args=hyperparam_args, display_name=self.lightning_container.tag, @@ -386,6 +374,8 @@ def run_with_logging(project_root: Path) -> Tuple[LightningContainer, AzureRunIn Start the main main entry point for training and testing models from the commandline. When running in Azure, this method also redirects the stdout stream, so that all console output is visible both on the console and stored in a file. The filename is timestamped and contains the DDP rank of the current process. + This can help in particular when jobs get preempted, but we don't get access to the logs from the previous + incarnation of the job. :param project_root: The root folder that contains all of the source code that should be executed. :return: If submitting to AzureML, returns the model configuration that was used for training, diff --git a/hi-ml/src/health_ml/utils/common_utils.py b/hi-ml/src/health_ml/utils/common_utils.py index bf28d928d..6e94d87e3 100644 --- a/hi-ml/src/health_ml/utils/common_utils.py +++ b/hi-ml/src/health_ml/utils/common_utils.py @@ -3,12 +3,13 @@ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import logging import os from contextlib import contextmanager from datetime import datetime from enum import Enum, unique from pathlib import Path -from typing import Any, Generator, List, Optional +from typing import Any, Generator, List, Optional, Tuple import torch from torch.nn import Module @@ -19,10 +20,12 @@ MAX_PATH_LENGTH = 260 -# convert string to None if an empty string or whitespace is provided + +logger = logging.getLogger(__name__) def empty_string_to_none(x: Optional[str]) -> Optional[str]: + """convert string to None if an empty string or whitespace is provided""" return None if (x is None or len(x.strip()) == 0) else x @@ -226,3 +229,80 @@ def seed_monai_if_available(seed: int) -> None: set_determinism(seed=seed) except ImportError: pass + + +def initialize_rpdb() -> None: + """ + On Linux only, import and initialize rpdb, to enable remote debugging if necessary. + """ + # rpdb signal trapping does not work on Windows, as there is no SIGTRAP: + if not is_linux(): + return + import rpdb + + rpdb_port = 4444 + rpdb.handle_trap(port=rpdb_port) + # For some reason, os.getpid() does not return the ID of what appears to be the currently running process. + logger.info( + "rpdb is handling traps. To debug: identify the main runner.py process, then as root: " + f"kill -TRAP ; nc 127.0.0.1 {rpdb_port}" + ) + + +def _is_running_in_docker() -> bool: + """Return True if the present process is likely to run inside a Docker container""" + return Path("/.dockerenv").exists() + + +def get_docker_memory_gb(verbose: bool = False) -> Optional[float]: + """Get the total amount of memory when running in a Docker container. If the process does not + appear to run in a Docker container, return None. + + :param verbose: If True, print the amount of total Docker memory to stdout. + :return: The total amount of Docker memory in GB, or None if the process is not running in Docker. + """ + if is_linux() and _is_running_in_docker(): + mem_limit_path = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes") + try: + mem_limit = int(mem_limit_path.read_text().strip()) + except: + logger.warning(f"Unable to read {mem_limit_path} or process the contents.") + return None + byte_per_GB = 1024.0**3 + docker_gb = round(mem_limit / byte_per_GB, 3) + if verbose: + print(f"Total Docker memory: {docker_gb} GB") + return docker_gb + + if verbose: + print("Unable to determine Docker memory size because the process does not appear to run in Docker.") + return None + + +def get_memory_gb(verbose: bool = False) -> Optional[Tuple[float, float, float, float]]: + """Get the CPU memory, available CPU memory, total memory (CPU plus swap), available memory in GB. + This relies on the Linux 'free' command being available. The function returns None if the command is not available. + + :param verbose: If True, print the output of the 'free' command to stdout. + :return: Tuple of (CPU memory, CPU memory available, total memory, total memory available), all in GB. + """ + free_commandline = "free -t -m" + try: + free_output = os.popen(free_commandline).readlines() + except: + logger.warning(f"Unable to run '{free_commandline}'") + return None + if verbose: + print(f"Checking available memory. Result of running '{free_commandline}' (available memory in MB):") + for line in free_output: + # Lines still contain a newline at the end, so no need to add that + print(line, end="") + if len(free_output) < 4: + logger.warning(f"Unexpected result when running '{free_commandline}': {free_output}") + return None + cpu_mem, _, cpu_mem_available = map(float, free_output[1].split()[1:4]) + total_mem, _, total_mem_available = map(float, free_output[3].split()[1:4]) + MB_per_GB = 1024.0 + values = (cpu_mem, cpu_mem_available, total_mem, total_mem_available) + result = tuple(map(lambda x: round(x / MB_per_GB, 3), values)) + return result # type: ignore diff --git a/hi-ml/testhiml/testhiml/test_common_utils.py b/hi-ml/testhiml/testhiml/test_common_utils.py index 692e7ab09..62a32964f 100644 --- a/hi-ml/testhiml/testhiml/test_common_utils.py +++ b/hi-ml/testhiml/testhiml/test_common_utils.py @@ -2,12 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +from io import StringIO from pathlib import Path -from unittest.mock import patch +from unittest.mock import MagicMock, patch import os import pytest from health_ml.utils import common_utils +from health_ml.utils.common_utils import get_docker_memory_gb, get_memory_gb, is_linux @pytest.mark.parametrize("os_name, expected_val", [("nt", True), ("None", False), ("posix", False), ("", False)]) @@ -34,3 +36,90 @@ def test_change_working_directory(tmp_path: Path) -> None: assert str(Path.cwd()) == tmp_path_str # outside of the context, the original working directory should be restored assert str(Path.cwd()) == orig_cwd_str != tmp_path_str + + +@pytest.mark.skipif(not is_linux(), reason="Test only runs on Linux") +def test_available_memory(capsys: pytest.CaptureFixture) -> None: + """ + Test that get_memory_gb returns a value greater than 0 + """ + # All tests run on Linux, so the result should be available + + result = get_memory_gb(verbose=False) + assert result is not None + assert len(result) == 4 + for val in result: + assert isinstance(val, float) + assert val > 0.0 + stdout: str = capsys.readouterr().out + assert len(stdout) == 0 + + +@pytest.mark.skipif(not is_linux(), reason="Test only runs on Linux") +def test_available_memory_prints(capsys: pytest.CaptureFixture) -> None: + """ + Test that get_memory_gb prints the result of running 'free' + """ + # All tests run on Linux, so the result should be available + + get_memory_gb(verbose=True) + stdout: str = capsys.readouterr().out + assert len(stdout.splitlines()) == 5 + + +def test_available_memory_reads_correctly(capsys: pytest.CaptureFixture) -> None: + """ + Test that get_memory_gb picks the right fields of the output of 'free' + """ + free_output = """ total used free shared buff/cache available +Mem: 9950 3316 5133 2 1500 6332 +Swap: 3072 15 3056 +Total: 13022 3331 8190 +""" + with patch("os.popen", return_value=StringIO(free_output)): + result = get_memory_gb(verbose=True) + assert result is not None + assert len(result) == 4 + assert result == (9.717, 5.013, 12.717, 7.998) + stdout: str = capsys.readouterr().out + assert free_output in stdout + + +def test_docker_memory(capsys: pytest.CaptureFixture) -> None: + """Test that docker_memory returns the expected values""" + with patch.multiple( + "health_ml.utils.common_utils", + is_linux=MagicMock(return_value=True), + _is_running_in_docker=MagicMock(return_value=True), + ): + expected_gb = 1.234 + with patch("pathlib.Path.read_text", return_value=str(int(expected_gb * 1024**3))): + # Test that the function returns the expected value, and that it prints nothing when verbose=False + result = get_docker_memory_gb(verbose=False) + assert result == expected_gb + stdout: str = capsys.readouterr().out + assert len(stdout.splitlines()) == 0 + # Test that the function prints the expected value when verbose=True + result = get_docker_memory_gb(verbose=True) + assert result == expected_gb + stdout = capsys.readouterr().out + assert stdout.splitlines() == [f"Total Docker memory: {expected_gb} GB"] + + with patch("pathlib.Path.read_text", side_effect=ValueError): + assert get_docker_memory_gb() is None + + +def test_docker_memory_outside_docker(capsys: pytest.CaptureFixture) -> None: + """Test that docker_memory returns None if not running in Docker""" + with patch.multiple( + "health_ml.utils.common_utils", + is_linux=MagicMock(return_value=True), + _is_running_in_docker=MagicMock(return_value=False), + ): + assert get_docker_memory_gb() is None + stdout: str = capsys.readouterr().out + assert len(stdout.splitlines()) == 0 + # Test that the function prints the expected value when verbose=True + assert get_docker_memory_gb(verbose=True) is None + stdout = capsys.readouterr().out + assert stdout.startswith("Unable to determine Docker memory") diff --git a/hi-ml/testhiml/testhiml/test_runner.py b/hi-ml/testhiml/testhiml/test_runner.py index 7ad39b903..f22528b23 100644 --- a/hi-ml/testhiml/testhiml/test_runner.py +++ b/hi-ml/testhiml/testhiml/test_runner.py @@ -374,7 +374,7 @@ def test_submit_to_azure_docker(mock_runner: Runner) -> None: # call_args is a tuple of (args, kwargs) call_kwargs = mock_submit_to_aml.call_args[1] # Submission to AzureML should have been turned on because a cluster name was supplied - assert mock_runner.experiment_config.docker_shm_size == docker_shm_size + assert mock_runner.lightning_container.docker_shm_size == docker_shm_size assert call_kwargs["docker_shm_size"] == docker_shm_size