diff --git a/heat/core/__init__.py b/heat/core/__init__.py
index 5a506c572..82326ed5b 100644
--- a/heat/core/__init__.py
+++ b/heat/core/__init__.py
@@ -32,3 +32,4 @@
 from . import version
 from .version import __version__
 from .vmap import *
+from ._config import *
diff --git a/heat/core/_config.py b/heat/core/_config.py
new file mode 100644
index 000000000..da327835a
--- /dev/null
+++ b/heat/core/_config.py
@@ -0,0 +1,79 @@
+"""
+Everything you need to know about the configuration of Heat
+"""
+
+import torch
+import platform
+import mpi4py.MPI
+import subprocess
+import os
+import warnings
+import re
+
+# basic facts about the platform, the MPI library, and the PyTorch installation
+PLATFORM = platform.platform()
+MPI_LIBRARY_VERSION = mpi4py.MPI.Get_library_version()
+TORCH_VERSION = torch.__version__
+TORCH_CUDA_IS_AVAILABLE = torch.cuda.is_available()
+# a PyTorch version string containing "rocm" is taken as the sign of a ROCm build
+CUDA_IS_ACTUALLY_ROCM = "rocm" in TORCH_VERSION
+
+
+def _openmpi_gpu_support():
+    """
+    Ask ``ompi_info`` whether OpenMPI has been built with CUDA- and/or ROCm-support.
+
+    Returns
+    -------
+    Tuple[bool, bool]
+        The pair ``(cuda_aware, rocm_aware)``. Both entries are ``False`` if ``ompi_info`` cannot be
+        executed or exits with an error.
+    """
+    try:
+        buffer = subprocess.check_output(["ompi_info", "--parsable", "--all"])
+    except (OSError, subprocess.CalledProcessError):
+        # ompi_info is missing or failed: nothing can be detected this way
+        return False, False
+    cuda_aware = b"mpi_built_with_cuda_support:value:true" in buffer
+    # check_output returns bytes, so pattern and search term have to be bytes as well
+    # NOTE(review): confirm that the parsable output contains an "MPI extensions:" line
+    extensions = re.search(rb"^MPI extensions:.*", buffer, re.MULTILINE)
+    rocm_aware = extensions is not None and b"rocm" in extensions.group(0)
+    return cuda_aware, rocm_aware
+
+
+# check whether there is CUDA- or ROCm-aware OpenMPI
+CUDA_AWARE_MPI, ROCM_AWARE_MPI = _openmpi_gpu_support()
+
+# do the same for MVAPICH
+CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("MV2_USE_CUDA") == "1"
+ROCM_AWARE_MPI = ROCM_AWARE_MPI or os.environ.get("MV2_USE_ROCM") == "1"
+
+# do the same for MPICH, TODO: outdated?
+CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("MPIR_CVAR_ENABLE_HCOLL") == "1"
+
+# Cray MPICH: combine with "or" so that results of the checks above are not discarded
+CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("MPICH_GPU_SUPPORT_ENABLED") == "1"
+ROCM_AWARE_MPI = ROCM_AWARE_MPI or os.environ.get("MPICH_GPU_SUPPORT_ENABLED") == "1"
+
+# do the same for ParaStationMPI, seems to have CUDA-support only
+CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("PSP_CUDA") == "1"
+
+# Intel-MPI?
+
+# warn the user if CUDA/ROCm-aware MPI is not available, but PyTorch can use GPUs with CUDA/ROCm
+if TORCH_CUDA_IS_AVAILABLE:
+    if not CUDA_IS_ACTUALLY_ROCM and not CUDA_AWARE_MPI:
+        warnings.warn(
+            f"Heat has CUDA GPU-support (PyTorch version {TORCH_VERSION} and `torch.cuda.is_available() = True`), but CUDA-awareness of MPI could not be detected. This may lead to performance degradation as direct MPI-communication between GPUs is not possible.",
+            UserWarning,
+        )
+    elif CUDA_IS_ACTUALLY_ROCM and not ROCM_AWARE_MPI:
+        warnings.warn(
+            f"Heat has ROCm GPU-support (PyTorch version {TORCH_VERSION} and `torch.cuda.is_available() = True`), but ROCm-awareness of MPI could not be detected. This may lead to performance degradation as direct MPI-communication between GPUs is not possible.",
+            UserWarning,
+        )
+    # GPU-aware only if the detected MPI awareness matches the GPU backend PyTorch uses
+    GPU_AWARE_MPI = ROCM_AWARE_MPI if CUDA_IS_ACTUALLY_ROCM else CUDA_AWARE_MPI
+else:
+    GPU_AWARE_MPI = False
diff --git a/heat/core/communication.py b/heat/core/communication.py
index 5cefe5041..22890f2b5 100644
--- a/heat/core/communication.py
+++ b/heat/core/communication.py
@@ -7,25 +7,14 @@
 import numpy as np
 import math
 import ctypes
-import os
-import subprocess
 import torch
+import warnings
 from mpi4py import MPI
 from typing import Any, Callable, Optional, List, Tuple, Union
 
 from .stride_tricks import sanitize_axis
 
-CUDA_AWARE_MPI = False
-# check whether OpenMPI support CUDA-aware MPI
-if "openmpi" in os.environ.get("MPI_SUFFIX", "").lower():
-    buffer = subprocess.check_output(["ompi_info", "--parsable", "--all"])
-    CUDA_AWARE_MPI = b"mpi_built_with_cuda_support:value:true" in buffer
-# MVAPICH
-CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("MV2_USE_CUDA") == "1"
-# MPICH
-CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("MPIR_CVAR_ENABLE_HCOLL") == "1"
-# ParaStationMPI
-CUDA_AWARE_MPI = CUDA_AWARE_MPI or os.environ.get("PSP_CUDA") == "1"
+from ._config import CUDA_AWARE_MPI
 
 
 class MPIRequest: