From 448563296501d4834d9648374e439d7c54346d2c Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Wed, 3 May 2023 17:21:27 +0100
Subject: [PATCH 1/9] adding tests for crossval

---
 .github/workflows/hi-ml-pr.yml | 44 ++++++++++++++++++++++++++++++----
 hi-ml/Makefile                 |  8 +++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/hi-ml-pr.yml b/.github/workflows/hi-ml-pr.yml
index 223c55c2f..95045f6b6 100644
--- a/.github/workflows/hi-ml-pr.yml
+++ b/.github/workflows/hi-ml-pr.yml
@@ -224,7 +224,7 @@ jobs:
         with:
             flags: ${{ matrix.folder }}
 
-  himl_smoke_helloworld_v1:
+  smoke_helloworld_v1:
     runs-on: ubuntu-20.04
     needs: [ cancel-azureml ]
     steps:
@@ -242,7 +242,7 @@ jobs:
           cd hi-ml
           make smoke_helloworld_v1
 
-  himl_smoke_helloworld_v2:
+  smoke_helloworld_v2:
     runs-on: ubuntu-20.04
     needs: [ cancel-azureml ]
     steps:
@@ -260,7 +260,7 @@ jobs:
           cd hi-ml
           make smoke_helloworld_v2
 
-  himl_smoke_helloworld_v1_2nodes:
+  smoke_helloworld_v1_2nodes:
     runs-on: ubuntu-20.04
     needs: [ cancel-azureml ]
     steps:
@@ -278,7 +278,7 @@ jobs:
           cd hi-ml
           make smoke_helloworld_v1_2nodes
 
-  himl_smoke_helloworld_v2_2nodes:
+  smoke_helloworld_v2_2nodes:
     runs-on: ubuntu-20.04
     needs: [ cancel-azureml ]
     steps:
@@ -296,6 +296,42 @@ jobs:
           cd hi-ml
           make smoke_helloworld_v2_2nodes
 
+  smoke_helloworld_v1_crossval:
+    runs-on: ubuntu-20.04
+    needs: [ cancel-azureml ]
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          lfs: true
+
+      - name: Set up Python ${{ env.pythonVersion }}
+        uses: ./.github/actions/prepare_himl_python_env
+        with:
+          python-version: ${{ env.pythonVersion }}
+
+      - name: Run smoke_helloworld_v1_crossval
+        run: |
+          cd hi-ml
+          make smoke_helloworld_v1_crossval
+
+  smoke_helloworld_v2_crossval:
+    runs-on: ubuntu-20.04
+    needs: [ cancel-azureml ]
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          lfs: true
+
+      - name: Set up Python ${{ env.pythonVersion }}
+        uses: ./.github/actions/prepare_himl_python_env
+        with:
+          python-version: ${{ env.pythonVersion }}
+
+      - name: Run smoke_helloworld_v2_crossval
+        run: |
+          cd hi-ml
+          make smoke_helloworld_v2_crossval
+
   himl-smoke-tests-completed:
     # This job is just a placeholder to ensure that all smoke tests have completed before
     # publishing the package. Reference this job rather than the individual smoke tests.
diff --git a/hi-ml/Makefile b/hi-ml/Makefile
index eb1ca58bf..b8df80ea9 100644
--- a/hi-ml/Makefile
+++ b/hi-ml/Makefile
@@ -103,3 +103,11 @@ smoke_helloworld_v1_2nodes:
 # HelloWorld model training on 2 nodes, submitted using the v2 SDK
 smoke_helloworld_v2_2nodes:
 	python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --num_nodes=2 --tag smoke_helloworld_v2_2nodes ${SHARED_ARGS}
+
+# HelloWorld model training via crossvalidation, submitted using the v1 SDK
+smoke_helloworld_v1_crossval:
+	python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=True --crossval_count=2 --tag smoke_helloworld_v1_crossval ${SHARED_ARGS}
+
+# HelloWorld model training via crossvalidation, submitted using the v2 SDK
+smoke_helloworld_v2_crossval:
+	python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --crossval_count=2 --tag smoke_helloworld_v2_crossval ${SHARED_ARGS}

From 0c1cd2cc14fad3ba5190c302707ec581d3883190 Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Wed, 3 May 2023 17:25:04 +0100
Subject: [PATCH 2/9] fix yaml

---
 .github/workflows/hi-ml-pr.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/hi-ml-pr.yml b/.github/workflows/hi-ml-pr.yml
index 95045f6b6..413091589 100644
--- a/.github/workflows/hi-ml-pr.yml
+++ b/.github/workflows/hi-ml-pr.yml
@@ -337,10 +337,12 @@ jobs:
     # publishing the package. Reference this job rather than the individual smoke tests.
     runs-on: ubuntu-20.04
     needs: [
-         himl_smoke_helloworld_v1,
-         himl_smoke_helloworld_v2,
-         himl_smoke_helloworld_v1_2nodes,
-         himl_smoke_helloworld_v2_2nodes,
+         smoke_helloworld_v1,
+         smoke_helloworld_v2,
+         smoke_helloworld_v1_2nodes,
+         smoke_helloworld_v2_2nodes,
+         smoke_helloworld_v1_crossval,
+         smoke_helloworld_v2_crossval,
       ]
     steps:
       - name: Smoke tests completed

From 3d626ff6135886f8bd57ef204f44067c3c6cca11 Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Wed, 3 May 2023 17:49:05 +0100
Subject: [PATCH 3/9] try fixing shm_size

---
 hi-ml-azure/src/health_azure/himl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hi-ml-azure/src/health_azure/himl.py b/hi-ml-azure/src/health_azure/himl.py
index 953304a92..ac3c697a1 100644
--- a/hi-ml-azure/src/health_azure/himl.py
+++ b/hi-ml-azure/src/health_azure/himl.py
@@ -584,6 +584,7 @@ def create_command_job(cmd: str) -> Command:
         # underlying command such as experiment name and max_total_trials
         job_to_submit.experiment_name = experiment_name
         job_to_submit.set_limits(max_total_trials=hyperparam_args.get(MAX_TOTAL_TRIALS_ARG, None))
+        job_to_submit.shm_size = docker_shm_size
 
     else:
         job_to_submit = create_command_job(cmd)

From 497db2237d5807d9b967bd39cade53250c55fabf Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Fri, 5 May 2023 11:20:19 +0100
Subject: [PATCH 4/9] add memory stats

---
 hi-ml/src/health_ml/runner.py                | 30 ++++--------
 hi-ml/src/health_ml/utils/common_utils.py    | 50 +++++++++++++++++++-
 hi-ml/testhiml/testhiml/test_common_utils.py | 49 +++++++++++++++++++
 3 files changed, 106 insertions(+), 23 deletions(-)

diff --git a/hi-ml/src/health_ml/runner.py b/hi-ml/src/health_ml/runner.py
index de1ff4487..95b30f1d8 100755
--- a/hi-ml/src/health_ml/runner.py
+++ b/hi-ml/src/health_ml/runner.py
@@ -50,7 +50,12 @@
 from health_ml.training_runner import TrainingRunner  # noqa: E402
 from health_ml.utils import fixed_paths  # noqa: E402
 from health_ml.utils.logging import ConsoleAndFileOutput  # noqa: E402
-from health_ml.utils.common_utils import check_conda_environment, choose_conda_env_file, is_linux  # noqa: E402
+from health_ml.utils.common_utils import (
+    check_conda_environment,
+    choose_conda_env_file,
+    get_memory_gb,
+    initialize_rpdb,
+)  # noqa: E402
 from health_ml.utils.config_loader import ModelConfigLoader  # noqa: E402
 from health_ml.utils import health_ml_package_setup  # noqa: E402
 
@@ -62,24 +67,6 @@
 sys.argv[0] = str(runner_path.resolve())
 
 
-def initialize_rpdb() -> None:
-    """
-    On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
-    """
-    # rpdb signal trapping does not work on Windows, as there is no SIGTRAP:
-    if not is_linux():
-        return
-    import rpdb
-
-    rpdb_port = 4444
-    rpdb.handle_trap(port=rpdb_port)
-    # For some reason, os.getpid() does not return the ID of what appears to be the currently running process.
-    logging.info(
-        "rpdb is handling traps. To debug: identify the main runner.py process, then as root: "
-        f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}"
-    )
-
-
 def create_runner_parser() -> argparse.ArgumentParser:
     """
     Creates a commandline parser, that understands all necessary arguments for training a model
@@ -191,9 +178,8 @@ def run(self) -> Tuple[LightningContainer, AzureRunInfo]:
         # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
         log_level = logging.INFO if is_local_rank_zero() else logging.ERROR
         logging_to_stdout(log_level)
-        # When running in Azure, also output logging to a file. This can help in particular when jobs
-        # get preempted, but we don't get access to the logs from the previous incarnation of the job
         initialize_rpdb()
+        get_memory_gb(print_stats=True)
         self.parse_and_load_model()
         self.validate()
         azure_run_info = self.submit_to_azureml_if_needed()
@@ -386,6 +372,8 @@ def run_with_logging(project_root: Path) -> Tuple[LightningContainer, AzureRunIn
     Start the main main entry point for training and testing models from the commandline.
     When running in Azure, this method also redirects the stdout stream, so that all console output is visible both on
     the console and stored in a file. The filename is timestamped and contains the DDP rank of the current process.
+    This can help in particular when jobs get preempted, but we don't get access to the logs from the previous
+    incarnation of the job.
 
     :param project_root: The root folder that contains all of the source code that should be executed.
     :return: If submitting to AzureML, returns the model configuration that was used for training,
diff --git a/hi-ml/src/health_ml/utils/common_utils.py b/hi-ml/src/health_ml/utils/common_utils.py
index bf28d928d..ab10a5537 100644
--- a/hi-ml/src/health_ml/utils/common_utils.py
+++ b/hi-ml/src/health_ml/utils/common_utils.py
@@ -3,12 +3,13 @@
 #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 #  ------------------------------------------------------------------------------------------
 
+import logging
 import os
 from contextlib import contextmanager
 from datetime import datetime
 from enum import Enum, unique
 from pathlib import Path
-from typing import Any, Generator, List, Optional
+from typing import Any, Generator, List, Optional, Tuple
 
 import torch
 from torch.nn import Module
@@ -19,10 +20,12 @@
 
 MAX_PATH_LENGTH = 260
 
-# convert string to None if an empty string or whitespace is provided
+
+logger = logging.getLogger(__name__)
 
 
 def empty_string_to_none(x: Optional[str]) -> Optional[str]:
+    """convert string to None if an empty string or whitespace is provided"""
     return None if (x is None or len(x.strip()) == 0) else x
 
 
@@ -226,3 +229,46 @@ def seed_monai_if_available(seed: int) -> None:
         set_determinism(seed=seed)
     except ImportError:
         pass
+
+
+def initialize_rpdb() -> None:
+    """
+    On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
+    """
+    # rpdb signal trapping does not work on Windows, as there is no SIGTRAP:
+    if not is_linux():
+        return
+    import rpdb
+
+    rpdb_port = 4444
+    rpdb.handle_trap(port=rpdb_port)
+    # For some reason, os.getpid() does not return the ID of what appears to be the currently running process.
+    logger.info(
+        "rpdb is handling traps. To debug: identify the main runner.py process, then as root: "
+        f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}"
+    )
+
+
+def get_memory_gb(print_stats: bool = False) -> Optional[Tuple[float, float, float, float]]:
+    """Get the CPU memory, available CPU memory, total memory (CPU plus swap) and available memory in GB.
+    This relies on the Linux 'free' command being available. The function returns None if the command is not available.
+
+    :param print_stats: If True, print the output of the 'free' command to stdout.
+    :return: Tuple of (CPU memory, CPU memory available, total memory, total memory available), all in GB.
+    """
+    free_commandline = "free -t -m"
+    try:
+        free_output = os.popen(free_commandline).readlines()
+    except:
+        return None
+    if print_stats:
+        print(f"Checking available memory. Result of running '{free_commandline}':")
+        for line in free_output:
+            # Lines still contain a newline at the end, so no need to add that
+            print(line, end="")
+    if len(free_output) < 4:
+        return None
+    cpu_mem, _, cpu_mem_available = map(float, free_output[1].split()[1:4])
+    total_mem, _, total_mem_available = map(float, free_output[3].split()[1:4])
+    MB_per_GB = 1024.0
+    return tuple(map(lambda x: round(x / MB_per_GB, 3), (cpu_mem, cpu_mem_available, total_mem, total_mem_available)))
diff --git a/hi-ml/testhiml/testhiml/test_common_utils.py b/hi-ml/testhiml/testhiml/test_common_utils.py
index 692e7ab09..e903d88e5 100644
--- a/hi-ml/testhiml/testhiml/test_common_utils.py
+++ b/hi-ml/testhiml/testhiml/test_common_utils.py
@@ -2,12 +2,14 @@
 #  Copyright (c) Microsoft Corporation. All rights reserved.
 #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 #  ------------------------------------------------------------------------------------------
+from io import StringIO
 from pathlib import Path
 from unittest.mock import patch
 import os
 import pytest
 
 from health_ml.utils import common_utils
+from health_ml.utils.common_utils import get_memory_gb, is_linux
 
 
 @pytest.mark.parametrize("os_name, expected_val", [("nt", True), ("None", False), ("posix", False), ("", False)])
@@ -34,3 +36,50 @@ def test_change_working_directory(tmp_path: Path) -> None:
         assert str(Path.cwd()) == tmp_path_str
     # outside of the context, the original working directory should be restored
     assert str(Path.cwd()) == orig_cwd_str != tmp_path_str
+
+
+@pytest.mark.skipif(not is_linux(), reason="Test only runs on Linux")
+def test_available_memory(capsys: pytest.CaptureFixture) -> None:
+    """
+    Test that get_memory_gb returns a value greater than 0
+    """
+    # All tests run on Linux, so the result should be available
+
+    result = get_memory_gb(print_stats=False)
+    assert result is not None
+    assert len(result) == 4
+    for val in result:
+        assert isinstance(val, float)
+        assert val > 0.0
+    stdout: str = capsys.readouterr().out
+    assert len(stdout) == 0
+
+
+@pytest.mark.skipif(not is_linux(), reason="Test only runs on Linux")
+def test_available_memory_prints(capsys: pytest.CaptureFixture) -> None:
+    """
+    Test that get_memory_gb prints the result of running 'free'
+    """
+    # All tests run on Linux, so the result should be available
+
+    get_memory_gb(print_stats=True)
+    stdout: str = capsys.readouterr().out
+    assert len(stdout.splitlines()) == 5
+
+
+def test_available_memory_reads_correctly(capsys: pytest.CaptureFixture) -> None:
+    """
+    Test that get_memory_gb picks the right fields of the output of 'free'
+    """
+    free_output = """              total        used        free      shared  buff/cache   available
+Mem:           9950        3316        5133           2        1500        6332
+Swap:          3072          15        3056
+Total:        13022        3331        8190
+"""
+    with patch("os.popen", return_value=StringIO(free_output)):
+        result = get_memory_gb(print_stats=True)
+        assert result is not None
+        assert len(result) == 4
+        assert result == (9.717, 5.013, 12.717, 7.998)
+        stdout: str = capsys.readouterr().out
+        assert free_output in stdout

From 81fcc1d662b26e55b0b2b2630d479a3b3cbeb6c8 Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Fri, 5 May 2023 11:28:23 +0100
Subject: [PATCH 5/9] tests

---
 hi-ml/src/health_ml/configs/hello_world.py  | 20 ++++++++++++++++++++
 hi-ml/src/health_ml/deep_learning_config.py |  3 +++
 hi-ml/src/health_ml/experiment_config.py    |  1 -
 hi-ml/src/health_ml/runner.py               |  2 +-
 hi-ml/testhiml/testhiml/test_runner.py      |  2 +-
 5 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/hi-ml/src/health_ml/configs/hello_world.py b/hi-ml/src/health_ml/configs/hello_world.py
index a9cc8562a..ef0f16bbe 100644
--- a/hi-ml/src/health_ml/configs/hello_world.py
+++ b/hi-ml/src/health_ml/configs/hello_world.py
@@ -2,6 +2,7 @@
 #  Copyright (c) Microsoft Corporation. All rights reserved.
 #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 #  ------------------------------------------------------------------------------------------
+import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -15,6 +16,7 @@
 from torch.utils.data import DataLoader, Dataset
 
 from health_ml.lightning_container import LightningContainer
+from health_ml.utils.common_utils import get_memory_gb
 
 TEST_MSE_FILE = "test_mse.txt"
 TEST_MAE_FILE = "test_mae.txt"
@@ -291,3 +293,21 @@ def get_callbacks(self) -> List[Callback]:
 
     def get_additional_aml_run_tags(self) -> Dict[str, str]:
         return {"max_epochs": str(self.max_epochs)}
+
+
+class HelloWorldWithMemoryCheck(HelloWorld):
+    """
+    A variant of the HelloWorld container that checks that there is enough memory available to run the model.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.docker_shm_size_gb = 20
+        self.docker_shm_size = f"{self.docker_shm_size_gb}g"
+
+    def before_training_on_global_rank_zero(self) -> None:
+        cpu_memory, _, _, _ = get_memory_gb(print_stats=True)
+        if cpu_memory < self.docker_shm_size_gb:
+            raise ValueError(
+                f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {cpu_memory} GB"
+            )
diff --git a/hi-ml/src/health_ml/deep_learning_config.py b/hi-ml/src/health_ml/deep_learning_config.py
index 5f56c9006..51fda60e7 100644
--- a/hi-ml/src/health_ml/deep_learning_config.py
+++ b/hi-ml/src/health_ml/deep_learning_config.py
@@ -168,6 +168,9 @@ class WorkflowParams(param.Parameterized):
     This class contains all parameters that affect how the whole training and testing workflow is executed.
     """
 
+    docker_shm_size: str = param.String(
+        "400g", doc="The Docker shared memory size that is required to run this model in AzureML."
+    )
     random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
     src_checkpoint: CheckpointParser = param.ClassSelector(
         class_=CheckpointParser, default=None, instantiate=False, doc=CheckpointParser.DOC
diff --git a/hi-ml/src/health_ml/experiment_config.py b/hi-ml/src/health_ml/experiment_config.py
index b63baf963..11c62b188 100644
--- a/hi-ml/src/health_ml/experiment_config.py
+++ b/hi-ml/src/health_ml/experiment_config.py
@@ -43,7 +43,6 @@ class ExperimentConfig(param.Parameterized):
         "over the network). When running outside AzureML, datasets will "
         "always be mounted.",
     )
-    docker_shm_size: str = param.String("400g", doc="The shared memory in the Docker image for the AzureML VMs.")
     wait_for_completion: bool = param.Boolean(
         default=False,
         doc="If True, wait for AML Run to complete before proceeding. If False, submit the run to AML and exit",
diff --git a/hi-ml/src/health_ml/runner.py b/hi-ml/src/health_ml/runner.py
index 95b30f1d8..de5aa57e0 100755
--- a/hi-ml/src/health_ml/runner.py
+++ b/hi-ml/src/health_ml/runner.py
@@ -264,7 +264,7 @@ def submit_to_azureml_if_needed(self) -> AzureRunInfo:
                 ignored_folders=[],
                 submit_to_azureml=bool(self.experiment_config.cluster),
                 docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
-                docker_shm_size=self.experiment_config.docker_shm_size,
+                docker_shm_size=self.lightning_container.docker_shm_size,
                 hyperdrive_config=hyperdrive_config,
                 hyperparam_args=hyperparam_args,
                 display_name=self.lightning_container.tag,
diff --git a/hi-ml/testhiml/testhiml/test_runner.py b/hi-ml/testhiml/testhiml/test_runner.py
index 7ad39b903..f22528b23 100644
--- a/hi-ml/testhiml/testhiml/test_runner.py
+++ b/hi-ml/testhiml/testhiml/test_runner.py
@@ -374,7 +374,7 @@ def test_submit_to_azure_docker(mock_runner: Runner) -> None:
             # call_args is a tuple of (args, kwargs)
             call_kwargs = mock_submit_to_aml.call_args[1]
             # Submission to AzureML should have been turned on because a cluster name was supplied
-            assert mock_runner.experiment_config.docker_shm_size == docker_shm_size
+            assert mock_runner.lightning_container.docker_shm_size == docker_shm_size
             assert call_kwargs["docker_shm_size"] == docker_shm_size
 
 

From 56aa4882e19d268ae4ba7f40b9fd95d8aa3f0fbd Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Fri, 5 May 2023 11:34:05 +0100
Subject: [PATCH 6/9] mypy

---
 hi-ml/src/health_ml/configs/hello_world.py | 4 +++-
 hi-ml/src/health_ml/utils/common_utils.py  | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/hi-ml/src/health_ml/configs/hello_world.py b/hi-ml/src/health_ml/configs/hello_world.py
index ef0f16bbe..307397044 100644
--- a/hi-ml/src/health_ml/configs/hello_world.py
+++ b/hi-ml/src/health_ml/configs/hello_world.py
@@ -306,7 +306,9 @@ def __init__(self) -> None:
         self.docker_shm_size = f"{self.docker_shm_size_gb}g"
 
     def before_training_on_global_rank_zero(self) -> None:
-        cpu_memory, _, _, _ = get_memory_gb(print_stats=True)
+        memory = get_memory_gb(print_stats=True)
+        assert memory is not None
+        cpu_memory, _, _, _ = memory
         if cpu_memory < self.docker_shm_size_gb:
             raise ValueError(
                 f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {cpu_memory} GB"
diff --git a/hi-ml/src/health_ml/utils/common_utils.py b/hi-ml/src/health_ml/utils/common_utils.py
index ab10a5537..0824e795a 100644
--- a/hi-ml/src/health_ml/utils/common_utils.py
+++ b/hi-ml/src/health_ml/utils/common_utils.py
@@ -271,4 +271,5 @@ def get_memory_gb(print_stats: bool = False) -> Optional[Tuple[float, float, flo
     cpu_mem, _, cpu_mem_available = map(float, free_output[1].split()[1:4])
     total_mem, _, total_mem_available = map(float, free_output[3].split()[1:4])
     MB_per_GB = 1024.0
-    return tuple(map(lambda x: round(x / MB_per_GB, 3), (cpu_mem, cpu_mem_available, total_mem, total_mem_available)))
+    values = (cpu_mem, cpu_mem_available, total_mem, total_mem_available)
+    return tuple(map(lambda x: round(x / MB_per_GB, 3), values))  # type: ignore

From 01edcab8a0fe60ced9dc87488d0d6ab03749b69b Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Fri, 5 May 2023 11:58:11 +0100
Subject: [PATCH 7/9] use memory check model for smoktests

---
 hi-ml/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hi-ml/Makefile b/hi-ml/Makefile
index b8df80ea9..645574e50 100644
--- a/hi-ml/Makefile
+++ b/hi-ml/Makefile
@@ -105,9 +105,13 @@ smoke_helloworld_v2_2nodes:
 	python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --num_nodes=2 --tag smoke_helloworld_v2_2nodes ${SHARED_ARGS}
 
 # HelloWorld model training via crossvalidation, submitted using the v1 SDK
+# Use the HelloWorldWithMemoryCheck model because there were issues in AzureML with not allocating enough memory in
+# Hyperdrive jobs
 smoke_helloworld_v1_crossval:
-	python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=True --crossval_count=2 --tag smoke_helloworld_v1_crossval ${SHARED_ARGS}
+	python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=True --crossval_count=2 --tag smoke_helloworld_v1_crossval ${SHARED_ARGS}
 
 # HelloWorld model training via crossvalidation, submitted using the v2 SDK
+# Use the HelloWorldWithMemoryCheck model because there were issues in AzureML with not allocating enough memory in
+# Hyperdrive jobs
 smoke_helloworld_v2_crossval:
-	python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --crossval_count=2 --tag smoke_helloworld_v2_crossval ${SHARED_ARGS}
+	python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=False --crossval_count=2 --tag smoke_helloworld_v2_crossval ${SHARED_ARGS}

From dda1ac0d249b7768f7e8d1ae77af4e0e4110870d Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Fri, 5 May 2023 18:09:40 +0100
Subject: [PATCH 8/9] docker memory

---
 hi-ml/src/health_ml/configs/hello_world.py   |  2 +-
 hi-ml/src/health_ml/runner.py                |  4 +-
 hi-ml/src/health_ml/utils/common_utils.py    | 43 +++++++++++++++++---
 hi-ml/testhiml/testhiml/test_common_utils.py |  6 +--
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/hi-ml/src/health_ml/configs/hello_world.py b/hi-ml/src/health_ml/configs/hello_world.py
index 307397044..a85401873 100644
--- a/hi-ml/src/health_ml/configs/hello_world.py
+++ b/hi-ml/src/health_ml/configs/hello_world.py
@@ -306,7 +306,7 @@ def __init__(self) -> None:
         self.docker_shm_size = f"{self.docker_shm_size_gb}g"
 
     def before_training_on_global_rank_zero(self) -> None:
-        memory = get_memory_gb(print_stats=True)
+        memory = get_memory_gb(verbose=True)
         assert memory is not None
         cpu_memory, _, _, _ = memory
         if cpu_memory < self.docker_shm_size_gb:
diff --git a/hi-ml/src/health_ml/runner.py b/hi-ml/src/health_ml/runner.py
index de5aa57e0..ea1d30ab2 100755
--- a/hi-ml/src/health_ml/runner.py
+++ b/hi-ml/src/health_ml/runner.py
@@ -53,6 +53,7 @@
 from health_ml.utils.common_utils import (
     check_conda_environment,
     choose_conda_env_file,
+    get_docker_memory_gb,
     get_memory_gb,
     initialize_rpdb,
 )  # noqa: E402
@@ -179,7 +180,8 @@ def run(self) -> Tuple[LightningContainer, AzureRunInfo]:
         log_level = logging.INFO if is_local_rank_zero() else logging.ERROR
         logging_to_stdout(log_level)
         initialize_rpdb()
-        get_memory_gb(print_stats=True)
+        get_memory_gb(verbose=True)
+        get_docker_memory_gb(verbose=True)
         self.parse_and_load_model()
         self.validate()
         azure_run_info = self.submit_to_azureml_if_needed()
diff --git a/hi-ml/src/health_ml/utils/common_utils.py b/hi-ml/src/health_ml/utils/common_utils.py
index 0824e795a..481cbf6ad 100644
--- a/hi-ml/src/health_ml/utils/common_utils.py
+++ b/hi-ml/src/health_ml/utils/common_utils.py
@@ -249,27 +249,60 @@ def initialize_rpdb() -> None:
     )
 
 
-def get_memory_gb(print_stats: bool = False) -> Optional[Tuple[float, float, float, float]]:
-    """Get the CPU memory, available CPU memory, total memory (CPU plus swap) and available memory in GB.
+def _is_running_in_docker() -> bool:
+    """Return True if the present process is likely to run inside a Docker container"""
+    return Path("/.dockerenv").exists()
+
+
+def get_docker_memory_gb(verbose: bool = True) -> Optional[float]:
+    """Get the total amount of memory when running in a Docker container. If the process does not
+    appear to run in a Docker container, return None.
+
+    :param verbose: If True, print the amount of total Docker memory to stdout.
+    :return: The total amount of Docker memory in GB, or None if the process is not running in Docker.
+    """
+    if is_linux() and _is_running_in_docker():
+        mem_limit_path = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes")
+        try:
+            mem_limit = int(mem_limit_path.read_text().strip())
+        except:
+            logger.warning(f"Unable to read {mem_limit_path} or process the contents.")
+            return None
+        byte_per_GB = 1024.0**3
+        docker_gb = round(mem_limit / byte_per_GB, 3)
+        if verbose:
+            print(f"Total Docker memory: {docker_gb} GB")
+        return docker_gb
+
+    if verbose:
+        print("Unable to determine Docker memory size because the process does not appear to run in Docker.")
+    return None
+
+
+def get_memory_gb(verbose: bool = False) -> Optional[Tuple[float, float, float, float]]:
+    """Get the CPU memory, available CPU memory, total memory (CPU plus swap), available memory in GB.
     This relies on the Linux 'free' command being available. The function returns None if the command is not available.
 
-    :param print_stats: If True, print the output of the 'free' command to stdout.
+    :param verbose: If True, print the output of the 'free' command to stdout.
     :return: Tuple of (CPU memory, CPU memory available, total memory, total memory available), all in GB.
     """
     free_commandline = "free -t -m"
     try:
         free_output = os.popen(free_commandline).readlines()
     except:
+        logger.warning(f"Unable to run '{free_commandline}'")
         return None
-    if print_stats:
+    if verbose:
         print(f"Checking available memory. Result of running '{free_commandline}':")
         for line in free_output:
             # Lines still contain a newline at the end, so no need to add that
             print(line, end="")
     if len(free_output) < 4:
+        logger.warning(f"Unexpected result when running '{free_commandline}': {free_output}")
         return None
     cpu_mem, _, cpu_mem_available = map(float, free_output[1].split()[1:4])
     total_mem, _, total_mem_available = map(float, free_output[3].split()[1:4])
     MB_per_GB = 1024.0
     values = (cpu_mem, cpu_mem_available, total_mem, total_mem_available)
-    return tuple(map(lambda x: round(x / MB_per_GB, 3), values))  # type: ignore
+    result = tuple(map(lambda x: round(x / MB_per_GB, 3), values))
+    return result  # type: ignore
diff --git a/hi-ml/testhiml/testhiml/test_common_utils.py b/hi-ml/testhiml/testhiml/test_common_utils.py
index e903d88e5..21c5b2399 100644
--- a/hi-ml/testhiml/testhiml/test_common_utils.py
+++ b/hi-ml/testhiml/testhiml/test_common_utils.py
@@ -45,7 +45,7 @@ def test_available_memory(capsys: pytest.CaptureFixture) -> None:
     """
     # All tests run on Linux, so the result should be available
 
-    result = get_memory_gb(print_stats=False)
+    result = get_memory_gb(verbose=False)
     assert result is not None
     assert len(result) == 4
     for val in result:
@@ -62,7 +62,7 @@ def test_available_memory_prints(capsys: pytest.CaptureFixture) -> None:
     """
     # All tests run on Linux, so the result should be available
 
-    get_memory_gb(print_stats=True)
+    get_memory_gb(verbose=True)
     stdout: str = capsys.readouterr().out
     assert len(stdout.splitlines()) == 5
 
@@ -77,7 +77,7 @@ def test_available_memory_reads_correctly(capsys: pytest.CaptureFixture) -> None
 Total:        13022        3331        8190
 """
     with patch("os.popen", return_value=StringIO(free_output)):
-        result = get_memory_gb(print_stats=True)
+        result = get_memory_gb(verbose=True)
         assert result is not None
         assert len(result) == 4
         assert result == (9.717, 5.013, 12.717, 7.998)

From 6d36fa7cd53f5c4b7949578e1b0bfcd8bd9bae99 Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer <antonsc@microsoft.com>
Date: Fri, 5 May 2023 18:54:04 +0100
Subject: [PATCH 9/9] tests and use in test model

---
 hi-ml/src/health_ml/configs/hello_world.py   | 11 +++--
 hi-ml/src/health_ml/utils/common_utils.py    |  4 +-
 hi-ml/testhiml/testhiml/test_common_utils.py | 44 +++++++++++++++++++-
 3 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/hi-ml/src/health_ml/configs/hello_world.py b/hi-ml/src/health_ml/configs/hello_world.py
index a85401873..dd301596f 100644
--- a/hi-ml/src/health_ml/configs/hello_world.py
+++ b/hi-ml/src/health_ml/configs/hello_world.py
@@ -16,7 +16,7 @@
 from torch.utils.data import DataLoader, Dataset
 
 from health_ml.lightning_container import LightningContainer
-from health_ml.utils.common_utils import get_memory_gb
+from health_ml.utils.common_utils import get_docker_memory_gb
 
 TEST_MSE_FILE = "test_mse.txt"
 TEST_MAE_FILE = "test_mae.txt"
@@ -306,10 +306,9 @@ def __init__(self) -> None:
         self.docker_shm_size = f"{self.docker_shm_size_gb}g"
 
     def before_training_on_global_rank_zero(self) -> None:
-        memory = get_memory_gb(verbose=True)
-        assert memory is not None
-        cpu_memory, _, _, _ = memory
-        if cpu_memory < self.docker_shm_size_gb:
+        docker_memory = get_docker_memory_gb(verbose=True)
+        assert docker_memory is not None
+        if docker_memory < self.docker_shm_size_gb:
             raise ValueError(
-                f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {cpu_memory} GB"
+                f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {docker_memory} GB"
             )
diff --git a/hi-ml/src/health_ml/utils/common_utils.py b/hi-ml/src/health_ml/utils/common_utils.py
index 481cbf6ad..6e94d87e3 100644
--- a/hi-ml/src/health_ml/utils/common_utils.py
+++ b/hi-ml/src/health_ml/utils/common_utils.py
@@ -254,7 +254,7 @@ def _is_running_in_docker() -> bool:
     return Path("/.dockerenv").exists()
 
 
-def get_docker_memory_gb(verbose: bool = True) -> Optional[float]:
+def get_docker_memory_gb(verbose: bool = False) -> Optional[float]:
     """Get the total amount of memory when running in a Docker container. If the process does not
     appear to run in a Docker container, return None.
 
@@ -293,7 +293,7 @@ def get_memory_gb(verbose: bool = False) -> Optional[Tuple[float, float, float,
         logger.warning(f"Unable to run '{free_commandline}'")
         return None
     if verbose:
-        print(f"Checking available memory. Result of running '{free_commandline}':")
+        print(f"Checking available memory. Result of running '{free_commandline}' (available memory in MB):")
         for line in free_output:
             # Lines still contain a newline at the end, so no need to add that
             print(line, end="")
diff --git a/hi-ml/testhiml/testhiml/test_common_utils.py b/hi-ml/testhiml/testhiml/test_common_utils.py
index 21c5b2399..62a32964f 100644
--- a/hi-ml/testhiml/testhiml/test_common_utils.py
+++ b/hi-ml/testhiml/testhiml/test_common_utils.py
@@ -4,12 +4,12 @@
 #  ------------------------------------------------------------------------------------------
 from io import StringIO
 from pathlib import Path
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 import os
 import pytest
 
 from health_ml.utils import common_utils
-from health_ml.utils.common_utils import get_memory_gb, is_linux
+from health_ml.utils.common_utils import get_docker_memory_gb, get_memory_gb, is_linux
 
 
 @pytest.mark.parametrize("os_name, expected_val", [("nt", True), ("None", False), ("posix", False), ("", False)])
@@ -83,3 +83,43 @@ def test_available_memory_reads_correctly(capsys: pytest.CaptureFixture) -> None
         assert result == (9.717, 5.013, 12.717, 7.998)
         stdout: str = capsys.readouterr().out
         assert free_output in stdout
+
+
+def test_docker_memory(capsys: pytest.CaptureFixture) -> None:
+    """Test that docker_memory returns the expected values"""
+    with patch.multiple(
+        "health_ml.utils.common_utils",
+        is_linux=MagicMock(return_value=True),
+        _is_running_in_docker=MagicMock(return_value=True),
+    ):
+        expected_gb = 1.234
+        with patch("pathlib.Path.read_text", return_value=str(int(expected_gb * 1024**3))):
+            # Test that the function returns the expected value, and that it prints nothing when verbose=False
+            result = get_docker_memory_gb(verbose=False)
+            assert result == expected_gb
+            stdout: str = capsys.readouterr().out
+            assert len(stdout.splitlines()) == 0
+            # Test that the function prints the expected value when verbose=True
+            result = get_docker_memory_gb(verbose=True)
+            assert result == expected_gb
+            stdout = capsys.readouterr().out
+            assert stdout.splitlines() == [f"Total Docker memory: {expected_gb} GB"]
+
+        with patch("pathlib.Path.read_text", side_effect=ValueError):
+            assert get_docker_memory_gb() is None
+
+
+def test_docker_memory_outside_docker(capsys: pytest.CaptureFixture) -> None:
+    """Test that docker_memory returns None if not running in Docker"""
+    with patch.multiple(
+        "health_ml.utils.common_utils",
+        is_linux=MagicMock(return_value=True),
+        _is_running_in_docker=MagicMock(return_value=False),
+    ):
+        assert get_docker_memory_gb() is None
+        stdout: str = capsys.readouterr().out
+        assert len(stdout.splitlines()) == 0
+        # Test that the function prints the expected value when verbose=True
+        assert get_docker_memory_gb(verbose=True) is None
+        stdout = capsys.readouterr().out
+        assert stdout.startswith("Unable to determine Docker memory")