microsoft
diff --git a/‎InnerEye/Common/common_util.py
Lines changed: 80 additions & 1 deletion b/‎InnerEye/Common/common_util.py
Lines changed: 80 additions & 1 deletion
diff --git a/‎InnerEye/ML/SSL/lightning_modules/ssl_online_evaluator.py
Lines changed: 3 additions & 9 deletions b/‎InnerEye/ML/SSL/lightning_modules/ssl_online_evaluator.py
Lines changed: 3 additions & 9 deletions
diff --git a/‎InnerEye/ML/configs/other/fastmri_varnet.py
Lines changed: 2 additions & 2 deletions b/‎InnerEye/ML/configs/other/fastmri_varnet.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎InnerEye/ML/lightning_base.py
Lines changed: 1 addition & 1 deletion b/‎InnerEye/ML/lightning_base.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎InnerEye/ML/model_training.py
Lines changed: 5 additions & 6 deletions b/‎InnerEye/ML/model_training.py
Lines changed: 5 additions & 6 deletions
diff --git a/‎InnerEye/ML/run_ml.py
Lines changed: 29 additions & 23 deletions b/‎InnerEye/ML/run_ml.py
Lines changed: 29 additions & 23 deletions
diff --git a/‎InnerEye/ML/runner.py
Lines changed: 15 additions & 15 deletions b/‎InnerEye/ML/runner.py
Lines changed: 15 additions & 15 deletions
@@ -12,7 +12,14 @@
 from enum import Enum
 from functools import wraps
 from pathlib import Path
-from typing import Any, Callable, Generator, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
+
+import conda_merge
+import ruamel.yaml
+from health_azure.utils import (
+    CONDA_CHANNELS, CONDA_DEPENDENCIES, CONDA_NAME, CONDA_PIP, CondaDependencies, PinnedOperator,
+    _log_conda_dependencies_stats, _retrieve_unique_deps, is_conda_file_with_pip_include, is_pip_include_dependency
+)
 
 from InnerEye.Common.fixed_paths import repository_root_directory
 from InnerEye.Common.type_annotations import PathOrString
@@ -427,3 +434,75 @@ def change_working_directory(path_or_str: PathOrString) -> Generator:
     os.chdir(new_path)
     yield
     os.chdir(old_path)
+
+
+def merge_conda_files(
+    conda_files: List[Path],
+    result_file: Path,
+    pip_files: Optional[List[Path]] = None,
+) -> None:
+    """
+    Merges the given Conda environment files using the conda_merge package, optionally adds any
+    dependencies from pip requirements files, and writes the merged file to disk.
+
+    :param conda_files: The Conda environment files to read.
+    :param result_file: The location where the merge results should be written.
+    :param pip_files: An optional list of one or more pip requirements files including extra dependencies.
+    :raises conda_merge.MergeError: If channels or dependencies cannot be merged (logged, then re-raised).
+    :raises ValueError: If the merged dependencies contain no dictionary with the key 'pip'.
+    """
+    env_definitions: List[Any] = []
+    for file in conda_files:
+        _, pip_without_include = is_conda_file_with_pip_include(file)
+        env_definitions.append(pip_without_include)
+    unified_definition: Dict[str, Any] = {}
+
+    # splitlines() and strip() keep CRLF line endings and stray whitespace out of the dependency strings.
+    # Blank lines and pip include directives are skipped.
+    extra_pip_deps: List[str] = []
+    for pip_file in pip_files or []:
+        stripped_lines = (line.strip() for line in pip_file.read_text().splitlines())
+        extra_pip_deps.extend(d for d in stripped_lines if d and not is_pip_include_dependency(d))
+
+    name = conda_merge.merge_names(env.get(CONDA_NAME) for env in env_definitions)
+    if name:
+        unified_definition[CONDA_NAME] = name
+
+    try:
+        channels = conda_merge.merge_channels(env.get(CONDA_CHANNELS) for env in env_definitions)
+    except conda_merge.MergeError:
+        logging.error("Failed to merge channel priorities.")
+        raise
+    if channels:
+        unified_definition[CONDA_CHANNELS] = channels
+
+    try:
+        deps_to_merge = [env.get(CONDA_DEPENDENCIES) for env in env_definitions]
+        if extra_pip_deps:
+            deps_to_merge.append([{CONDA_PIP: extra_pip_deps}])
+        deps = conda_merge.merge_dependencies(deps_to_merge)
+
+        # Get conda dependencies and pip dependencies from specification
+        pip_deps_entries = [d for d in deps if isinstance(d, dict) and CONDA_PIP in d]  # type: ignore
+        if not pip_deps_entries:
+            raise ValueError("Didn't find a dictionary with the key 'pip' in the list of dependencies")
+        pip_deps_entry: Dict[str, List[str]] = pip_deps_entries[0]
+        pip_deps = pip_deps_entry[CONDA_PIP]
+        # temporarily remove pip dependencies from deps to be added back after deduplication
+        deps.remove(pip_deps_entry)
+
+        # remove all non-pip duplicates from the list of dependencies
+        unique_deps = _retrieve_unique_deps(deps, PinnedOperator.CONDA)
+        unique_pip_deps = sorted(_retrieve_unique_deps(pip_deps, PinnedOperator.PIP))
+
+        # finally add back the deduplicated list of dependencies
+        unique_deps.append({CONDA_PIP: unique_pip_deps})  # type: ignore
+    except conda_merge.MergeError:
+        logging.error("Failed to merge dependencies.")
+        raise
+    # unique_deps always holds at least the pip entry appended above, so it can never be empty here.
+    unified_definition[CONDA_DEPENDENCIES] = unique_deps
+
+    with result_file.open("w") as f:
+        ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
+    _log_conda_dependencies_stats(CondaDependencies(result_file), "Merged Conda environment")
@@ -97,16 +97,10 @@ def on_pretrain_routine_start(self, trainer: pl.Trainer, pl_module: pl.Lightning
                                       p=self.drop_p,
                                       n_hidden=self.hidden_dim)
         self.evaluator.to(pl_module.device)
-        if hasattr(trainer, "accelerator_connector"):
-            # This works with Lightning 1.3.8
-            accelerator = trainer.accelerator_connector
-        elif hasattr(trainer, "_accelerator_connector"):
-            # This works with Lightning 1.5.5
-            accelerator = trainer._accelerator_connector
-        else:
-            raise ValueError("Unable to retrieve the accelerator information")
+        accelerator = trainer._accelerator_connector
+
         if accelerator.is_distributed:
-            if accelerator.use_ddp:
+            if accelerator.strategy.strategy_name == "ddp":
                 self.evaluator = SyncBatchNorm.convert_sync_batchnorm(self.evaluator)
                 self.evaluator = DistributedDataParallel(self.evaluator, device_ids=[pl_module.device])  # type: ignore
             else:
 
@@ -32,8 +32,8 @@ class VarNetWithImageLogging(VarNetModule):
     """
 
     def log_image(self, name: str, image: torch.Tensor) -> None:
-        experiments = self.logger.experiment if isinstance(self.logger.experiment, list) \
-            else [self.logger.experiment]
+        experiments = self.loggers[0].experiment if isinstance(self.loggers[0].experiment, list) \
+            else [self.loggers[0].experiment]
         for experiment in experiments:
             if isinstance(experiment, SummaryWriter):
                 experiment.add_image(name, image, global_step=self.global_step)
 
@@ -289,7 +289,7 @@ def on_train_end(self) -> None:
         This hook is called at the very end of training. Use that to write the very last set of training and
         validation metrics from the StoringLogger to disk.
         """
-        self.read_epoch_results_from_logger_and_store(epoch=self.current_epoch)
+        self.read_epoch_results_from_logger_and_store(epoch=self.current_epoch-1)
 
     @rank_zero_only
     def read_epoch_results_from_logger_and_store(self, epoch: int) -> None:
 
@@ -120,7 +120,7 @@ def create_lightning_trainer(container: LightningContainer,
                                                save_top_k=0)
     recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                                    filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
-                                                   every_n_val_epochs=container.autosave_every_n_val_epochs,
+                                                   every_n_epochs=container.autosave_every_n_val_epochs,
                                                    save_last=False)
     callbacks: List[Callback] = [
         last_checkpoint_callback,
@@ -264,11 +264,10 @@ def model_train(checkpoint_path: Optional[Path],
         lightning_model.storing_logger = storing_logger
 
     logging.info("Starting training")
-    # When training models that are not built-in InnerEye models, we have no guarantee that they write
-    # files to the right folder. Best guess is to change the current working directory to where files should go.
-    with change_working_directory(container.outputs_folder):
-        trainer.fit(lightning_model, datamodule=data_module)
-        trainer.logger.close()  # type: ignore
+
+    trainer.fit(lightning_model, datamodule=data_module)
+    trainer.logger.close()  # type: ignore
+
     world_size = getattr(trainer, "world_size", 0)
     is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
     # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
 
@@ -15,33 +15,37 @@
 import torch.multiprocessing
 from azureml._restclient.constants import RunStatus
 from azureml.core import Model, Run, model
+from health_azure import AzureRunInfo
+from health_azure.utils import ENVIRONMENT_VERSION, create_run_recovery_id, is_global_rank_zero
 from pytorch_lightning import LightningModule, seed_everything
 from pytorch_lightning.core.datamodule import LightningDataModule
 from torch.utils.data import DataLoader
 
 from InnerEye.Azure import azure_util
 from InnerEye.Azure.azure_config import AzureConfig
 from InnerEye.Azure.azure_runner import ENV_OMPI_COMM_WORLD_RANK, get_git_tags
-from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, \
-    EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, \
-    PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, \
-    get_all_environment_files, is_offline_run_context
+from InnerEye.Azure.azure_util import (
+    CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, EFFECTIVE_RANDOM_SEED_KEY_NAME,
+    IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT,
+    RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files, is_offline_run_context
+)
 from InnerEye.Common import fixed_paths
-from InnerEye.Common.common_util import (BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE,
-                                         CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE,
-                                         METRICS_AGGREGATES_FILE, ModelProcessing,
-                                         OTHER_RUNS_SUBDIR_NAME, SCATTERPLOTS_SUBDIR_NAME, SUBJECT_METRICS_FILE_NAME,
-                                         change_working_directory, get_best_epoch_results_path, is_windows,
-                                         logging_section, print_exception, remove_file_or_directory)
+from InnerEye.Common.common_util import (
+    BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME,
+    FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, OTHER_RUNS_SUBDIR_NAME, SCATTERPLOTS_SUBDIR_NAME,
+    SUBJECT_METRICS_FILE_NAME, ModelProcessing, change_working_directory, get_best_epoch_results_path,
+    is_windows, logging_section, merge_conda_files, print_exception, remove_file_or_directory
+)
 from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PYTHON_ENVIRONMENT_NAME
 from InnerEye.Common.type_annotations import PathOrString
 from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
-from InnerEye.ML.common import CHECKPOINT_FOLDER, EXTRA_RUN_SUBFOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, \
-    FINAL_MODEL_FOLDER, \
-    ModelExecutionMode
+from InnerEye.ML.common import (
+    CHECKPOINT_FOLDER, EXTRA_RUN_SUBFOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, ModelExecutionMode
+)
 from InnerEye.ML.config import SegmentationModelBase
-from InnerEye.ML.deep_learning_config import DeepLearningConfig, ModelCategory, MultiprocessingStartMethod, \
-    load_checkpoint
+from InnerEye.ML.deep_learning_config import (
+    DeepLearningConfig, ModelCategory, MultiprocessingStartMethod, load_checkpoint
+)
 from InnerEye.ML.lightning_base import InnerEyeContainer
 from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
 from InnerEye.ML.lightning_loggers import StoringLogger
@@ -50,16 +54,16 @@
 from InnerEye.ML.model_inference_config import ModelInferenceConfig
 from InnerEye.ML.model_testing import model_test
 from InnerEye.ML.model_training import create_lightning_trainer, model_train
-from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \
-    generate_classification_multilabel_notebook, generate_classification_notebook, generate_segmentation_notebook, \
-    get_ipynb_report_name, reports_folder
+from InnerEye.ML.reports.notebook_report import (
+    generate_classification_crossval_notebook, generate_classification_multilabel_notebook,
+    generate_classification_notebook, generate_segmentation_notebook, get_ipynb_report_name, reports_folder
+)
 from InnerEye.ML.scalar_config import ScalarModelBase
 from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler, download_all_checkpoints_from_run
 from InnerEye.ML.visualizers import activation_maps
-from InnerEye.ML.visualizers.plot_cross_validation import \
+from InnerEye.ML.visualizers.plot_cross_validation import (
     get_config_and_results_for_offline_runs, plot_cross_validation_from_files
-from health_azure import AzureRunInfo
-from health_azure.utils import ENVIRONMENT_VERSION, create_run_recovery_id, is_global_rank_zero, merge_conda_files
+)
 
 ModelDeploymentHookSignature = Callable[[LightningContainer, AzureConfig, Model, ModelProcessing], Any]
 PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
@@ -797,8 +801,10 @@ def create_ensemble_model_and_run_inference(self) -> None:
         remove_file_or_directory(other_runs_dir)
 
     def plot_cross_validation_and_upload_results(self) -> Path:
-        from InnerEye.ML.visualizers.plot_cross_validation import crossval_config_from_model_config, \
-            plot_cross_validation, unroll_aggregate_metrics
+        from InnerEye.ML.visualizers.plot_cross_validation import (
+            crossval_config_from_model_config, plot_cross_validation, unroll_aggregate_metrics
+        )
+
         # perform aggregation as cross val splits are now ready
         plot_crossval_config = crossval_config_from_model_config(self.innereye_config)
         plot_crossval_config.run_recovery_id = PARENT_RUN_CONTEXT.tags[RUN_RECOVERY_ID_KEY_NAME]
 
@@ -24,36 +24,36 @@
 # in a submodule
 fixed_paths.add_submodules_to_path()
 
+import matplotlib
 from azureml._base_sdk_common import user_agent
 from azureml._restclient.constants import RunStatus
 from azureml.core import Run, ScriptRunConfig
 from health_azure import AzureRunInfo, submit_to_azure_if_needed
-from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
-    to_azure_friendly_string
-import matplotlib
+from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, to_azure_friendly_string
 
-from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
 from InnerEye.Azure import azure_util
 from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
-from InnerEye.Azure.azure_runner import (DEFAULT_DOCKER_BASE_IMAGE, create_dataset_configs, create_experiment_name,
-                                         create_runner_parser,
-                                         get_git_tags,
-                                         parse_args_and_add_yaml_variables,
-                                         parse_arguments, additional_run_tags,
-                                         set_environment_variables_for_multi_node)
-from InnerEye.Azure.azure_util import (RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files,
-                                       is_offline_run_context)
+from InnerEye.Azure.azure_runner import (
+    DEFAULT_DOCKER_BASE_IMAGE, additional_run_tags, create_dataset_configs,
+    create_experiment_name, create_runner_parser, get_git_tags,
+    parse_args_and_add_yaml_variables, parse_arguments, set_environment_variables_for_multi_node
+)
+from InnerEye.Azure.azure_util import (
+    RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files, is_offline_run_context
+)
 from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
-from InnerEye.Common.common_util import (FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE,
-                                         is_linux, logging_to_stdout)
+from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
+from InnerEye.Common.common_util import (
+    FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, is_linux, logging_to_stdout, merge_conda_files
+)
 from InnerEye.Common.generic_parsing import GenericConfig
 from InnerEye.ML.common import DATASET_CSV_FILE_NAME
 from InnerEye.ML.deep_learning_config import DeepLearningConfig
 from InnerEye.ML.lightning_base import InnerEyeContainer
+from InnerEye.ML.lightning_container import LightningContainer
 from InnerEye.ML.model_config_base import ModelConfigBase
 from InnerEye.ML.run_ml import MLRunner, ModelDeploymentHookSignature, PostCrossValidationHookSignature
 from InnerEye.ML.utils.config_loader import ModelConfigLoader
-from InnerEye.ML.lightning_container import LightningContainer
 
 # We change the current working directory before starting the actual training. However, this throws off starting
 # the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute