diff --git a/CMakeLists.txt b/CMakeLists.txt
index 219abe707f..ecbe09895e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,7 +51,7 @@ endif (__GIT_EXECUTABLE)
# This must be set because version tags
set(HYDROGEN_VERSION_MAJOR 1)
set(HYDROGEN_VERSION_MINOR 5)
-set(HYDROGEN_VERSION_PATCH 2)
+set(HYDROGEN_VERSION_PATCH 4)
set(HYDROGEN_VERSION_MAJOR_MINOR
"${HYDROGEN_VERSION_MAJOR}.${HYDROGEN_VERSION_MINOR}")
set(HYDROGEN_VERSION
@@ -101,10 +101,7 @@ if (NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE)
endif ()
# Feature-related options
-
-option(Hydrogen_ENABLE_ALUMINUM
- "Enable the Aluminum package for improved device-side communication."
- OFF)
+include(CMakeDependentOption)
option(Hydrogen_ENABLE_CUDA
"Search for CUDA support and enable related features if found."
@@ -114,6 +111,12 @@ option(Hydrogen_ENABLE_ROCM
"Search for ROCm/HIP support and enable related features if found."
OFF)
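+# Roctracer is only meaningful with ROCm: the dependent option below
+# defaults to OFF and is forced OFF whenever Hydrogen_ENABLE_ROCM is
+# disabled.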
+cmake_dependent_option(Hydrogen_ENABLE_ROCTRACER
+ "Search for Roctracer and enable related features if found."
+ OFF
+ "Hydrogen_ENABLE_ROCM"
+ OFF)
+
if (Hydrogen_ENABLE_ROCM AND Hydrogen_ENABLE_CUDA)
message(FATAL_ERROR
"ROCm and CUDA code paths are mutually exclusive. "
@@ -313,10 +316,6 @@ if (Hydrogen_ENABLE_CUDA)
set(HYDROGEN_GPU_USE_TENSOR_OP_MATH TRUE)
endif ()
- if (Hydrogen_ENABLE_GPU_FP16)
- set(HYDROGEN_GPU_USE_FP16 TRUE)
- endif ()
-
if (Hydrogen_ENABLE_CUB)
if (CUDAToolkit_VERSION_MAJOR LESS 11)
find_package(CUB MODULE REQUIRED)
@@ -358,74 +357,102 @@ if (Hydrogen_ENABLE_ROCM)
find_package(rocsolver CONFIG REQUIRED)
find_package(rocthrust CONFIG REQUIRED)
+ if (Hydrogen_ENABLE_ROCTRACER)
+ find_package(Roctracer MODULE COMPONENTS roctx)
+ set(HYDROGEN_HAVE_ROCTRACER ${Roctracer_FOUND})
+ endif ()
+
+ include(HydrogenCleanupHIPTargets)
+ h_clean_hip_targets()
+
set(HYDROGEN_HAVE_ROCM TRUE)
message(STATUS "Found ROCm/HIP toolchain. Using HIP/ROCm.")
set(H_ROCM_CXX_LIBS
hip::host
- hip::hipcub
roc::rocblas
roc::rocsolver
- roc::rocthrust)
-
+ roc::rocthrust
+ ${Roctracer_LIBRARIES})
+ if (HYDROGEN_HAVE_CUB)
+ list(APPEND H_ROCM_CXX_LIBS hip::hipcub)
+ endif ()
set(H_ROCM_HIP_LIBS
hip::device)
endif (Hydrogen_ENABLE_ROCM)
if (HYDROGEN_HAVE_CUDA OR HYDROGEN_HAVE_ROCM)
set(HYDROGEN_HAVE_GPU TRUE)
+
+ if (Hydrogen_ENABLE_GPU_FP16)
+ set(HYDROGEN_GPU_USE_FP16 TRUE)
+ endif ()
endif ()
-if (Hydrogen_ENABLE_ALUMINUM)
- find_package(Aluminum 1.0.0 CONFIG QUIET)
+find_package(Aluminum 1.0.0 CONFIG QUIET)
+if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE)
+ message(STATUS
+ "A candidate Aluminum > v1.0.0 was found, but was not selected:")
+ message(STATUS
+ " ${Aluminum_NOT_FOUND_MESSAGE}")
+endif ()
+# Try again, since we're technically ok with >v0.7.0
+if (NOT Aluminum_FOUND)
+ find_package(Aluminum 0.7.0 CONFIG QUIET)
if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE)
message(STATUS
- "A candidate Aluminum > v1.0.0 was found, but was not selected:")
+ "A candidate Aluminum > v0.7.0 was found, but was not selected:")
message(STATUS
" ${Aluminum_NOT_FOUND_MESSAGE}")
endif ()
- # Try again, since we're technically ok with >v0.7.0
- if (NOT Aluminum_FOUND)
- find_package(Aluminum 0.7.0 CONFIG QUIET)
- if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE)
- message(STATUS
- "A candidate Aluminum > v0.7.0 was found, but was not selected:")
- message(STATUS
- " ${Aluminum_NOT_FOUND_MESSAGE}")
- endif ()
- endif ()
+endif ()
- if (Aluminum_FOUND)
- set(HYDROGEN_HAVE_ALUMINUM TRUE)
- message(STATUS
- "Found Aluminum@${ALUMINUM_VERSION}: ${Aluminum_DIR}")
+if (Aluminum_FOUND)
+ set(HYDROGEN_HAVE_ALUMINUM TRUE)
+ message(STATUS
+ "Found Aluminum@${ALUMINUM_VERSION}: ${Aluminum_DIR}")
- if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL)
- set(HYDROGEN_HAVE_NCCL2 TRUE)
- message(STATUS "Aluminum detected with NCCL2 backend support.")
- else ()
- set(HYDROGEN_HAVE_NCCL2 FALSE)
- endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL)
+ if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL)
+ set(HYDROGEN_HAVE_NCCL2 TRUE)
+ message(STATUS "Aluminum detected with NCCL2 backend support.")
+ else ()
+ set(HYDROGEN_HAVE_NCCL2 FALSE)
+ endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL)
- if (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER)
- set(HYDROGEN_HAVE_AL_HOST_XFER TRUE)
- message(STATUS "Aluminum detected with HostTransfer backend support.")
- else ()
- set(HYDROGEN_HAVE_AL_HOST_XFER FALSE)
- endif (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER)
+ if (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER)
+ set(HYDROGEN_HAVE_AL_HOST_XFER TRUE)
+ message(STATUS "Aluminum detected with HostTransfer backend support.")
+ else ()
+ set(HYDROGEN_HAVE_AL_HOST_XFER FALSE)
+ endif (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER)
- if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA)
- set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE)
- message(STATUS "Aluminum detected with MPI-CUDA backend support.")
- else ()
- set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE)
- endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA)
+ if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA)
+ set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE)
+ message(STATUS "Aluminum detected with MPI-CUDA backend support.")
+ else ()
+ set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE)
+ endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA)
+
+ # Check for in-place SendRecv.
+ if (ALUMINUM_VERSION VERSION_GREATER_EQUAL "1.3.0")
+ set(HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV TRUE)
else ()
- message(FATAL_ERROR "Aluminum support requested but not found. "
- "Please set Aluminum_DIR to point to the installation prefix "
- "for Aluminum.")
- endif (Aluminum_FOUND)
-endif (Hydrogen_ENABLE_ALUMINUM)
+ set(HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV FALSE)
+ endif ()
+
+ if (HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV)
+ message(STATUS "Aluminum detected with in-place SendRecv support.")
+ else ()
+ message(STATUS "Aluminum detected WITHOUT in-place SendRecv support.")
+ endif ()
+
+else ()
+
+ message(FATAL_ERROR "Aluminum support required but not found. "
+ "Please set Aluminum_ROOT to its installation prefix or add "
+ "the installation prefix to CMAKE_PREFIX_PATH.")
+
+endif (Aluminum_FOUND)
# Sets up EL_RESTRICT and EL_HAVE_PRETTY_FUNCTION
include(detect/CXX)
@@ -484,7 +511,7 @@ add_subdirectory(src)
# docs (which has the advantage that preprocessing will take
# "{,hydrogen_}config.h" into consideration).
configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/config.h.in"
- "${PROJECT_BINARY_DIR}/include/El/config.h")
+ "${PROJECT_BINARY_DIR}/include/El/config.h" ESCAPE_QUOTES)
configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/hydrogen_config.h.in"
"${PROJECT_BINARY_DIR}/include/El/hydrogen_config.h")
configure_file("${PROJECT_SOURCE_DIR}/doxy/Doxyfile.in"
diff --git a/ElementalREADME.md b/ElementalREADME.md
index 570370b917..1e9dae8091 100644
--- a/ElementalREADME.md
+++ b/ElementalREADME.md
@@ -1,10 +1,3 @@
-
-
-
-
-[](https://travis-ci.org/elemental/Elemental)
-[](https://gitter.im/elemental/chat?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-
**Elemental** is a modern C++ library for distributed-memory dense and
sparse-direct linear algebra, conic optimization, and lattice reduction.
The library was initially released in
@@ -15,7 +8,7 @@ was originally released during a project on [Parallel Sweeping Preconditioners](
### Documentation
-The (now outdated) [documentation for Elemental](http://libelemental.org/documentation) is built using [Sphinx](http://sphinx.pocoo.org) and the [Read the Docs Theme](http://docs.readthedocs.org/en/latest/theme.html)
+The (now outdated) documentation for Elemental is built using [Sphinx](http://sphinx.pocoo.org) and the [Read the Docs Theme](http://docs.readthedocs.org/en/latest/theme.html)
### Unique features
diff --git a/cmake/configure_files/HydrogenConfig.cmake.in b/cmake/configure_files/HydrogenConfig.cmake.in
index 8127f803ec..add71cc1b5 100644
--- a/cmake/configure_files/HydrogenConfig.cmake.in
+++ b/cmake/configure_files/HydrogenConfig.cmake.in
@@ -92,6 +92,7 @@ if (_HYDROGEN_HAVE_CUDA)
endif ()
set(_HYDROGEN_HAVE_ROCM @HYDROGEN_HAVE_ROCM@)
+set(_HYDROGEN_HAVE_ROCTRACER @HYDROGEN_HAVE_ROCTRACER@)
if (_HYDROGEN_HAVE_ROCM)
find_dependency(hip CONFIG)
@@ -106,6 +107,14 @@ if (_HYDROGEN_HAVE_ROCM)
find_dependency(rocblas CONFIG)
find_dependency(rocsolver CONFIG)
find_dependency(rocthrust CONFIG)
+
+ if (_HYDROGEN_HAVE_ROCTRACER)
+ find_dependency(Roctracer MODULE COMPONENTS roctx)
+ endif ()
+
+ include(HydrogenCleanupHIPTargets)
+ h_clean_hip_targets()
+
set(HYDROGEN_HAVE_ROCM TRUE)
endif (_HYDROGEN_HAVE_ROCM)
@@ -114,7 +123,7 @@ if (HYDROGEN_HAVE_HALF)
find_dependency(HALF)
endif ()
-if (_HYDROGEN_HAVE_CUDA)
+if (_HYDROGEN_HAVE_CUDA OR _HYDROGEN_HAVE_ROCM)
set(HYDROGEN_GPU_USE_FP16 @HYDROGEN_GPU_USE_FP16@)
endif ()
diff --git a/cmake/configure_files/hydrogen_config.h.in b/cmake/configure_files/hydrogen_config.h.in
index a3f7434d2d..02c7993b92 100644
--- a/cmake/configure_files/hydrogen_config.h.in
+++ b/cmake/configure_files/hydrogen_config.h.in
@@ -40,6 +40,7 @@
// ROCm stuff
#cmakedefine HYDROGEN_HAVE_ROCM
+#cmakedefine HYDROGEN_HAVE_ROCTRACER
// General GPU stuff
#cmakedefine HYDROGEN_HAVE_CUB
@@ -51,6 +52,7 @@
#cmakedefine HYDROGEN_HAVE_NCCL2
#cmakedefine HYDROGEN_HAVE_AL_MPI_CUDA
#cmakedefine HYDROGEN_HAVE_AL_HOST_XFER
+#cmakedefine HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV
#cmakedefine HYDROGEN_ENSURE_HOST_MPI_BUFFERS
#cmakedefine HYDROGEN_HAVE_CUDA_AWARE_MPI
diff --git a/cmake/modules/FindRoctracer.cmake b/cmake/modules/FindRoctracer.cmake
new file mode 100644
index 0000000000..d209bb1fee
--- /dev/null
+++ b/cmake/modules/FindRoctracer.cmake
@@ -0,0 +1,78 @@
+# Sets the following variables
+#
+# Roctracer_FOUND
+# Roctracer_LIBRARIES
+#
+# Defines the following imported targets:
+#
+# roctracer::roctracer
+# roctracer::roctracer_api
+# roctracer::roctx_api
+#
+
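+# Example usage (my_app is a hypothetical target; the find_package
+# call mirrors the one in Hydrogen's CMakeLists.txt):
+#
+#   find_package(Roctracer MODULE COMPONENTS roctx)
+#   if (Roctracer_FOUND)
+#     target_link_libraries(my_app PRIVATE ${Roctracer_LIBRARIES})
+#   endif ()
+#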
+set(_supported_components roctracer roctx)
+if (NOT Roctracer_FIND_COMPONENTS)
+ set(Roctracer_FIND_COMPONENTS ${_supported_components})
+endif ()
+
+foreach (comp IN LISTS Roctracer_FIND_COMPONENTS)
+ if (NOT ${comp} IN_LIST _supported_components)
+ message(FATAL_ERROR
+ "Cannot specify component \"${comp}\" for package Roctracer. "
+ "Supported components are: ${_supported_components}.")
+ endif ()
+
+ set(_header_name "${comp}.h")
+ set(_lib_name "${comp}64")
+
+ find_path(${comp}_INCLUDE_PATH ${_header_name}
+ HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
+ PATH_SUFFIXES include
+ DOC "The ${comp} include directory for roctracer."
+ NO_DEFAULT_PATH)
+ find_path(${comp}_INCLUDE_PATH ${_header_name}
+ HINTS ${ROCM_PATH}/include/roctracer $ENV{ROCM_PATH}/include/roctracer
+ DOC "The ${comp} include directory for roctracer."
+ NO_DEFAULT_PATH)
+ find_path(${comp}_INCLUDE_PATH ${_header_name})
+
+ find_library(${comp}_LIBRARY ${_lib_name}
+ HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
+ HINTS ${ROCM_PATH} $ENV{ROCM_PATH}
+ PATH_SUFFIXES lib64 lib
+ DOC "The ${comp} library for roctracer."
+ NO_DEFAULT_PATH)
+ find_library(${comp}_LIBRARY ${_lib_name})
+
+ if (${comp}_LIBRARY AND ${comp}_INCLUDE_PATH)
+ set(Roctracer_${comp}_FOUND TRUE)
+
+ if (NOT TARGET roctracer::${comp}_api)
+ add_library(roctracer::${comp}_api INTERFACE IMPORTED)
+ endif ()
+ target_link_libraries(roctracer::${comp}_api INTERFACE
+ "${${comp}_LIBRARY}")
+ target_include_directories(roctracer::${comp}_api INTERFACE
+ "${${comp}_INCLUDE_PATH}")
+
+ mark_as_advanced(${comp}_LIBRARY)
+ mark_as_advanced(${comp}_INCLUDE_PATH)
+
+ list(APPEND _imported_libraries roctracer::${comp}_api)
+ else ()
+ set(Roctracer_${comp}_FOUND FALSE)
+ endif ()
+endforeach ()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Roctracer HANDLE_COMPONENTS)
+
+if (Roctracer_FOUND)
+ if (NOT TARGET roctracer::roctracer)
+ add_library(roctracer::roctracer INTERFACE IMPORTED)
+ endif ()
+ foreach (lib IN LISTS _imported_libraries)
+ target_link_libraries(roctracer::roctracer INTERFACE ${lib})
+ endforeach ()
+ set(Roctracer_LIBRARIES roctracer::roctracer)
+endif (Roctracer_FOUND)
diff --git a/cmake/modules/HydrogenCleanupHIPTargets.cmake b/cmake/modules/HydrogenCleanupHIPTargets.cmake
new file mode 100644
index 0000000000..90d12b2590
--- /dev/null
+++ b/cmake/modules/HydrogenCleanupHIPTargets.cmake
@@ -0,0 +1,54 @@
+# Finds the HIP clang include path and clangrt builtins, then repairs
+# the hip::host/hip::device IMPORTED targets to reference them.
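+#
+# Usage, exactly as invoked from CMakeLists.txt and
+# HydrogenConfig.cmake:
+#
+#   include(HydrogenCleanupHIPTargets)
+#   h_clean_hip_targets()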
+macro(h_clean_hip_targets)
+ set(HIP_CLANG_ROOT "$ENV{ROCM_PATH}/llvm")
+
+ file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS
+ "${HIP_CLANG_ROOT}/lib/clang/*/include")
+ find_path(HIP_CLANG_INCLUDE_PATH stddef.h
+ HINTS "${HIP_CLANG_INCLUDE_SEARCH_PATHS}"
+ NO_DEFAULT_PATH)
+
+ if (HIP_CLANG_INCLUDE_PATH)
+ message(STATUS "Found clang include path: ${HIP_CLANG_INCLUDE_PATH}")
+ else ()
+ message(WARNING
+ "Could not find clang include path. "
+ "Using whatever is in the hip IMPORTED targets")
+ endif ()
+
+ file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS
+ "${HIP_CLANG_ROOT}/lib/clang/*/lib/*")
+ find_library(ACTUAL_CLANGRT_BUILTINS clangrt-builtins
+ NAMES
+ clang_rt.builtins
+ clang_rt.builtins-x86_64
+ PATHS
+ "${HIP_CLANGRT_LIB_SEARCH_PATHS}")
+
+ if (ACTUAL_CLANGRT_BUILTINS)
+ message(STATUS "Found clangrt builtins: ${ACTUAL_CLANGRT_BUILTINS}")
+ else ()
+ message(WARNING
+ "Could not find clangrt builtins. "
+ "Using whatever is in the hip IMPORTED targets")
+ endif ()
+
+ get_target_property(_HIP_HOST_LIBS hip::host INTERFACE_LINK_LIBRARIES)
+ get_target_property(_HIP_DEVICE_LIBS hip::device INTERFACE_LINK_LIBRARIES)
+
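+ # hip's IMPORTED targets can carry the literal placeholder
+ # "CLANGRT_BUILTINS-NOTFOUND" when hip's own find_library failed at
+ # packaging time; splice in the library located above instead.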
+ string(REPLACE
+ "CLANGRT_BUILTINS-NOTFOUND"
+ "${ACTUAL_CLANGRT_BUILTINS}"
+ _NEW_HIP_HOST_LIBS
+ "${_HIP_HOST_LIBS}")
+ string(REPLACE
+ "CLANGRT_BUILTINS-NOTFOUND"
+ "${ACTUAL_CLANGRT_BUILTINS}"
+ _NEW_HIP_DEVICE_LIBS
+ "${_HIP_DEVICE_LIBS}")
+
+ set_property(TARGET hip::host
+ PROPERTY INTERFACE_LINK_LIBRARIES ${_NEW_HIP_HOST_LIBS})
+ set_property(TARGET hip::device
+ PROPERTY INTERFACE_LINK_LIBRARIES ${_NEW_HIP_DEVICE_LIBS})
+endmacro()
diff --git a/include/El/blas_like/level1/Copy/Translate.hpp b/include/El/blas_like/level1/Copy/Translate.hpp
index 55579afb13..db3f68718e 100644
--- a/include/El/blas_like/level1/Copy/Translate.hpp
+++ b/include/El/blas_like/level1/Copy/Translate.hpp
@@ -67,9 +67,15 @@ void Translate(
const Int maxWidth = MaxLength(width, rowStride);
const Int pkgSize = mpi::Pad(maxHeight*maxWidth);
- simple_buffer buffer;
- if(crossRank == root || crossRank == B.Root())
- buffer.allocate(pkgSize);
+ // When crossRank == root, this will be a SEND buffer
+ // (+SendRecv when !aligned), and it should use
+ // syncInfoA. When crossRank == B.Root(), this will be a RECV
+ // buffer, and it should use syncInfoB. Otherwise, this isn't
+ // used.
+ simple_buffer buffer =
+ (crossRank == root || crossRank == B.Root()
+ ? simple_buffer(pkgSize, syncInfoA)
+ : simple_buffer{});
const Int colAlignB = B.ColAlign();
const Int rowAlignB = B.RowAlign();
diff --git a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
index 4783359c9c..77cf807c28 100644
--- a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
+++ b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
@@ -10,6 +10,8 @@
#define EL_BLAS_COPY_TRANSLATEBETWEENGRIDS_HPP
#include "core/environment/decl.hpp"
+#include <optional>
+
namespace El
{
namespace copy
@@ -3562,7 +3564,6 @@ void TranslateBetweenGridsAsync
const Int mLocA = A.LocalHeight();
const Int nLocA = A.LocalWidth();
-
mpi::Comm const& viewingCommB = B.Grid().ViewingComm();
mpi::Group owningGroupA = A.Grid().OwningGroup();
@@ -3856,7 +3857,8 @@ void TranslateBetweenGrids(
EL_DEBUG_CSE;
/* Overview
-
+ We broadcast the size of A to all ranks in B so that every
+ rank in the B subgrid has the correct size of A.
Since we are using blocking communication, some care is required
to avoid deadlocks. Let's start with a naive algorithm for
[STAR,VC] matrices and optimize it in steps:
@@ -3883,21 +3885,55 @@ void TranslateBetweenGrids(
*/
// Matrix dimensions
- const Int m = A.Height();
- const Int n = A.Width();
+ Int m = A.Height();
+ Int n = A.Width();
+ Int strideA = A.RowStride();
+ Int ALDim = A.LDim();
+
+ mpi::Comm const& viewingCommB = B.Grid().ViewingComm();
+
+ bool const inAGrid = A.Participating();
+ bool const inBGrid = B.Participating();
+
+ Int recvMetaData[4];
+ Int metaData[4];
+ if(inAGrid)
+ {
+ metaData[0] = m;
+ metaData[1] = n;
+ metaData[2] = strideA;
+ metaData[3] = ALDim;
+ }
+ else
+ {
+ metaData[0] = 0;
+ metaData[1] = 0;
+ metaData[2] = 0;
+ metaData[3] = 0;
+ }
+ const std::vector<Int> sendMetaData(metaData, metaData + 4);
+ mpi::AllReduce(sendMetaData.data(),
+ recvMetaData,
+ 4,
+ mpi::MAX,
+ viewingCommB,
+ SyncInfo<Device::CPU>{});
+
+ m = recvMetaData[0];
+ n = recvMetaData[1];
+ strideA = recvMetaData[2];
+ ALDim = recvMetaData[3];
+
B.Resize(m, n);
const Int nLocA = A.LocalWidth();
const Int nLocB = B.LocalWidth();
// Return immediately if there is no local data
- const bool inAGrid = A.Participating();
- const bool inBGrid = B.Participating();
if (!inAGrid && !inBGrid) {
return;
}
// Compute the number of messages to send/recv
- const Int strideA = A.RowStride();
const Int strideB = B.RowStride();
const Int strideGCD = GCD(strideA, strideB);
const Int numSends = Min(strideB/strideGCD, nLocA);
@@ -3906,14 +3942,24 @@ void TranslateBetweenGrids(
// Synchronize compute streams
SyncInfo<D> syncInfoA = SyncInfoFromMatrix(A.LockedMatrix());
SyncInfo<D> syncInfoB = SyncInfoFromMatrix(B.Matrix());
- auto syncHelper = MakeMultiSync(syncInfoB, syncInfoA);
- const SyncInfo<D>& syncInfo = syncHelper;
+
+ std::optional<MultiSync<D, D>> maybeMultiSync;
+ if (inAGrid && inBGrid)
+ maybeMultiSync.emplace(syncInfoB, syncInfoA);
+
+ SyncInfo<D> const syncInfo =
+ (maybeMultiSync.has_value()
+ ? *maybeMultiSync
+ : (inAGrid ? syncInfoA : syncInfoB));
+
+ // Collective!
+ mpi::EnsureComm<T, Collective::SEND>(viewingCommB, syncInfo);
+ mpi::EnsureComm<T, Collective::RECV>(viewingCommB, syncInfo);
// Translate the ranks from A's VC communicator to B's viewing so
// that we can match send/recv communicators. Since A's VC
// communicator is not necessarily defined on every process, we
// instead work with A's owning group.
- mpi::Comm const& viewingCommB = B.Grid().ViewingComm();
mpi::Group owningGroupA = A.Grid().OwningGroup();
const int sizeA = A.Grid().Size();
vector<int> viewingRanksA(sizeA), owningRanksA(sizeA);
@@ -3976,7 +4022,7 @@ void TranslateBetweenGrids(
// Copy data locally
copy::util::InterleaveMatrix(
m, messageWidth,
- A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(),
+ A.LockedBuffer(0,jLocA), 1, numSends*ALDim,
B.Buffer(0,jLocB), 1, numRecvs*B.LDim(),
syncInfo);
}
@@ -3984,7 +4030,7 @@ void TranslateBetweenGrids(
// Send data to other rank
copy::util::InterleaveMatrix(
m, messageWidth,
- A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(),
+ A.LockedBuffer(0,jLocA), 1, numSends*ALDim,
messageBuf.data(), 1, m,
syncInfo);
mpi::Send(
diff --git a/include/El/core.hpp b/include/El/core.hpp
index 173e0e6247..4009f55084 100644
--- a/include/El/core.hpp
+++ b/include/El/core.hpp
@@ -80,14 +80,10 @@ using hydrogen::gpu_half_type;
#endif // HYDROGEN_GPU_USE_FP16
}
-#if __cplusplus >= 201402L
-#define H_DEPRECATED(msg) [[deprecated(msg)]]
-#elif defined(__GNUC__)
-// This ^ isn't perfect -- many non-GCC compilers define __GNUC__.
-#define H_DEPRECATED(msg) __attribute__ ((deprecated(msg)))
-#else
+// NOTE: These have not been as inspirational as I had hoped. I'm
+// leaving the notes but preprocessing them away so the compile
+// warnings stop.
#define H_DEPRECATED(msg)
-#endif
#define EL_UNUSED(expr) (void)(expr)
diff --git a/include/El/core/Element/decl.hpp b/include/El/core/Element/decl.hpp
index 72d30b19c2..7cf87f7415 100644
--- a/include/El/core/Element/decl.hpp
+++ b/include/El/core/Element/decl.hpp
@@ -9,6 +9,7 @@
#ifndef EL_ELEMENT_DECL_HPP
#define EL_ELEMENT_DECL_HPP
+#include
#include
#include
@@ -262,7 +263,7 @@ template(alpha); }
};
template <typename S, typename T>
@@ -272,6 +273,37 @@ struct Caster<S,Complex<T>,void>
{ return Complex<T>( T(RealPart(alpha)), T(ImagPart(alpha)) ); }
};
+#if defined HYDROGEN_HAVE_ROCM && defined HYDROGEN_GPU_USE_FP16
+#if defined HYDROGEN_HAVE_HALF
+template <>
+struct Caster<gpu_half_type, cpu_half_type>
+{
+ static cpu_half_type Cast(gpu_half_type const& in)
+ {
+ return cpu_half_type(static_cast<float>(in));
+ }
+};// Caster<gpu_half_type, cpu_half_type>
+#endif // defined HYDROGEN_HAVE_HALF
+
+template <>
+struct Caster<gpu_half_type, double>
+{
+ static double Cast(gpu_half_type const& in)
+ {
+ return static_cast<double>(in);
+ }
+};// Caster<gpu_half_type, double>
+
+template <>
+struct Caster<gpu_half_type, Complex<gpu_half_type>>
+{
+ static Complex<gpu_half_type> Cast(gpu_half_type const& alpha)
+ {
+ return Complex<gpu_half_type>(static_cast<gpu_half_type>(alpha), gpu_half_type(0));
+ }
+};
+#endif // defined HYDROGEN_HAVE_ROCM && defined HYDROGEN_GPU_USE_FP16
+
// Set the real/imaginary part of an element
// -----------------------------------------
template
diff --git a/include/El/core/MemoryPool.hpp b/include/El/core/MemoryPool.hpp
--- a/include/El/core/MemoryPool.hpp
+++ b/include/El/core/MemoryPool.hpp
#include
+#include <iostream>
#include
#include
#include
@@ -21,6 +22,7 @@ namespace El
{
namespace details
{
+
template
void ThrowRuntimeError(Args&&... args)
{
@@ -29,6 +31,25 @@ void ThrowRuntimeError(Args&&... args)
(void) dummy;
throw std::runtime_error(oss.str());
}
+
+/** @brief Returns true iff env(H_MEMPOOL_DEBUG) is truthy.
+ *
+ * Truthy values are non-empty strings that start with any character
+ * other than '0' (ASCII "zero"). So "true", "false", "1", "13",
+ * "-q", ":)", and " " are all truthy, while "", "0true", "0false",
+ * "0000", "0123", and "0:)" are all falsey.
+ */
+bool debug_mempool() noexcept;
+
+/** @brief Check env(H_MEMPOOL_BIN_GROWTH). Default 1.6f. */
+float default_mempool_bin_growth() noexcept;
+
+/** @brief Check env(H_MEMPOOL_MIN_BIN). Default 1UL. */
+size_t default_mempool_min_bin() noexcept;
+
+/** @brief Check env(H_MEMPOOL_MAX_BIN). Default (1<<26). */
+size_t default_mempool_max_bin() noexcept;
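+
+/* A minimal illustrative sketch (not the actual definitions, which
+ * presumably live in a source file) of the truthiness rule described
+ * above, in the same spirit as use_separate_comm_stream() in
+ * El/core/imports/aluminum.hpp:
+ *
+ *   bool debug_mempool() noexcept
+ *   {
+ *       char const* const env = std::getenv("H_MEMPOOL_DEBUG");
+ *       return (env && std::strlen(env) && env[0] != '0');
+ *   }
+ */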
+
} // namespace details
/** Simple caching memory pool.
@@ -50,10 +71,13 @@ class MemoryPool
* @param bin_growth Controls how fast bins grow.
* @param min_bin_size Smallest bin size (in bytes).
* @param max_bin_size Largest bin size (in bytes).
+ * @param debug Print debugging messages.
*/
- MemoryPool(float bin_growth = 1.6,
- size_t min_bin_size = 1,
- size_t max_bin_size = 1<<26)
+ MemoryPool(float const bin_growth = details::default_mempool_bin_growth(),
+ size_t const min_bin_size = details::default_mempool_min_bin(),
+ size_t const max_bin_size = details::default_mempool_max_bin(),
+ bool const debug = details::debug_mempool())
+ : debug_{debug}
{
std::set<size_t> bin_sizes;
for (float bin_size = min_bin_size;
@@ -74,23 +98,47 @@ class MemoryPool
// Set up bins.
for (size_t i = 0; i < bin_sizes_.size(); ++i)
free_data_.emplace_back();
+ if (debug_)
+ {
+ std::clog << "==Mempool(" << this << ")== "
+ << "Created memory pool ("
+ << "pinned=" << (Pinned ? "t" : "f")
+ << ", growth=" << bin_growth
+ << ", min bin=" << bin_sizes_.front()
+ << ", max bin=" << bin_sizes_.back() << ")\n"
+ << "==Mempool(" << this << ")== "
+ << "Bin sizes: [";
+ for (auto const& b : bin_sizes_)
+ std::clog << " " << b;
+ std::clog << " ]" << std::endl;
+ }
}
~MemoryPool()
{
FreeAllUnused();
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << alloc_to_bin_.size()
+ << " dangling allocations\n"
+ << "==Mempool(" << this << ")== "
+ << "Destroyed memory pool"
+ << std::endl;
}
/** Return memory of size bytes. */
void* Allocate(size_t size)
{
- size_t bin = get_bin(size);
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Requesting allocation of "
+ << size << " bytes."
+ << std::endl;
+ size_t const bin = get_bin(size);
void* mem = nullptr;
std::lock_guard<std::mutex> lock(mutex_);
// size is too large, this will not be cached.
if (bin == INVALID_BIN)
- {
mem = do_allocation(size);
- }
else
{
// Check if there is available memory in our bin.
@@ -98,6 +146,10 @@ class MemoryPool
{
mem = free_data_[bin].back();
free_data_[bin].pop_back();
+ --num_cached_blks_;
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Reusing cached pointer " << mem << "\n";
}
else
{
@@ -105,41 +157,45 @@ class MemoryPool
}
}
alloc_to_bin_[mem] = bin;
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << alloc_to_bin_.size()
+ << " blocks allocated; "
+ << num_cached_blks_
+ << " blocks cached"
+ << std::endl;
+
return mem;
}
/** Release previously allocated memory. */
void Free(void* ptr)
{
std::lock_guard<std::mutex> lock(mutex_);
- auto iter = alloc_to_bin_.find(ptr);
+ auto const iter = alloc_to_bin_.find(ptr);
if (iter == alloc_to_bin_.end())
- {
details::ThrowRuntimeError("Tried to free unknown ptr");
- }
+
+ size_t const bin = iter->second;
+ alloc_to_bin_.erase(iter);
+ if (bin == INVALID_BIN)
+ do_free(ptr);
else
{
- size_t bin = iter->second;
- alloc_to_bin_.erase(iter);
- if (bin == INVALID_BIN)
- {
- do_free(ptr);
- }
- else
- {
- // Cache the pointer for reuse.
- free_data_[bin].push_back(ptr);
- }
+ // Cache the pointer for reuse.
+ free_data_[bin].push_back(ptr);
+ ++num_cached_blks_;
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Cached pointer " << ptr << "\n";
}
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << alloc_to_bin_.size()
+ << " blocks allocated; "
+ << num_cached_blks_
+ << " blocks cached"
+ << std::endl;
}
- /** Release all unused memory. */
- void FreeAllUnused()
- {
- std::lock_guard<std::mutex> lock(mutex_);
- for (size_t bin = 0; bin < bin_sizes_.size(); ++bin)
- for (auto&& ptr : free_data_[bin])
- do_free(ptr);
- }
-
private:
/** Index of an invalid bin. */
@@ -158,6 +214,25 @@ class MemoryPool
/** Map used pointers to the associated bin index. */
std::unordered_map<void*, size_t> alloc_to_bin_;
+ /** Track the total number of available blocks. */
+ size_t num_cached_blks_ = 0;
+
+ /** Print debugging messages throughout lifetime. */
+ bool debug_;
+
+ /** Release all unused memory. */
+ void FreeAllUnused()
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ for (size_t bin = 0; bin < bin_sizes_.size(); ++bin)
+ {
+ for (auto&& ptr : free_data_[bin])
+ do_free(ptr);
+ std::vector<void*>{}.swap(free_data_[bin]);
+ }
+ num_cached_blks_ = 0ul;
+ }
+
/** Allocate size bytes. */
inline void* do_allocation(size_t size);
/** Free ptr. */
@@ -179,7 +254,7 @@ class MemoryPool
#ifdef HYDROGEN_HAVE_CUDA
template <>
-inline void* MemoryPool<true>::do_allocation(size_t bytes)
+inline void* MemoryPool<true>::do_allocation(size_t const bytes)
{
void* ptr;
auto error = cudaMallocHost(&ptr, bytes);
@@ -189,11 +264,15 @@ inline void* MemoryPool<true>::do_allocation(size_t bytes)
"Failed to allocate CUDA pinned memory with message: ",
"\"", cudaGetErrorString(error), "\"");
}
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Allocated pinned " << bytes << " bytes at " << ptr
+ << std::endl;
return ptr;
}
template<>
-inline void MemoryPool<true>::do_free(void* ptr)
+inline void MemoryPool<true>::do_free(void* const ptr)
{
auto error = cudaFreeHost(ptr);
if (error != cudaSuccess)
@@ -202,49 +281,64 @@ inline void MemoryPool<true>::do_free(void* ptr)
"Failed to free CUDA pinned memory with message: ",
"\"", cudaGetErrorString(error), "\"");
}
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Freed pinned ptr " << ptr
+ << std::endl;
}
#elif defined(HYDROGEN_HAVE_ROCM)
template <>
-inline void* MemoryPool<true>::do_allocation(size_t bytes)
+inline void* MemoryPool<true>::do_allocation(size_t const bytes)
{
void* ptr;
auto error = hipHostMalloc(&ptr, bytes);
if (error != hipSuccess)
- {
details::ThrowRuntimeError(
"Failed to allocate HIP pinned memory with message: ",
"\"", hipGetErrorString(error), "\"");
- }
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Allocated pinned " << bytes << " bytes at " << ptr
+ << std::endl;
return ptr;
}
template<>
-inline void MemoryPool<true>::do_free(void* ptr)
+inline void MemoryPool<true>::do_free(void* const ptr)
{
auto error = hipHostFree(ptr);
if (error != hipSuccess)
- {
details::ThrowRuntimeError(
"Failed to free HIP pinned memory with message: ",
"\"", hipGetErrorString(error), "\"");
- }
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Freed pinned ptr " << ptr
+ << std::endl;
}
#endif // HYDROGEN_HAVE_CUDA
template <>
-inline void* MemoryPool<false>::do_allocation(size_t bytes) {
+inline void* MemoryPool<false>::do_allocation(size_t const bytes)
+{
void* ptr = std::malloc(bytes);
if (ptr == nullptr)
- {
details::ThrowRuntimeError("Failed to allocate memory");
- }
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Allocated " << bytes << " bytes at " << ptr
+ << std::endl;
return ptr;
}
template<>
-inline void MemoryPool<false>::do_free(void* ptr)
+inline void MemoryPool<false>::do_free(void* const ptr)
{
- return std::free(ptr);
+ std::free(ptr);
+ if (debug_)
+ std::clog << "==Mempool(" << this << ")== "
+ << "Freed ptr " << ptr
+ << std::endl;
}
#ifdef HYDROGEN_HAVE_GPU
diff --git a/include/El/core/Profiling.hpp b/include/El/core/Profiling.hpp
index a09199ff17..45d2a7181b 100644
--- a/include/El/core/Profiling.hpp
+++ b/include/El/core/Profiling.hpp
@@ -19,6 +19,10 @@ void DisableVTune() noexcept;
void EnableNVProf() noexcept;
void DisableNVProf() noexcept;
+// These are no-ops if roctracer is not enabled at compile time
+void EnableROCTX() noexcept;
+void DisableROCTX() noexcept;
+
/** \brief A selection of colors to use with the profiling interface.
*
* It seems unlikely that a user will ever need to access these by
diff --git a/include/El/core/environment/decl.hpp b/include/El/core/environment/decl.hpp
index 8426ee1f9d..b9367fde42 100644
--- a/include/El/core/environment/decl.hpp
+++ b/include/El/core/environment/decl.hpp
@@ -35,7 +35,10 @@ using std::ostream;
using std::ostringstream;
using std::exception;
-using std::uncaught_exception;
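+// std::uncaught_exception() was deprecated in C++17 and removed in
+// C++20; emulate it with std::uncaught_exceptions() instead.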
+inline bool uncaught_exception() noexcept
+{
+ return std::uncaught_exceptions() > 0;
+}
void PrintVersion( ostream& os=cout );
void PrintConfig( ostream& os=cout );
diff --git a/include/El/core/imports/aluminum.hpp b/include/El/core/imports/aluminum.hpp
index 6cc8391727..67da59d507 100644
--- a/include/El/core/imports/aluminum.hpp
+++ b/include/El/core/imports/aluminum.hpp
@@ -31,7 +31,11 @@ enum class Collective
REDUCE,
REDUCESCATTER,
SCATTER,
- SENDRECV
+
+ // Not collectives, but what can you do
+ SENDRECV,
+ SEND,
+ RECV,
};// enum class Collective
#ifndef HYDROGEN_HAVE_ALUMINUM
@@ -115,6 +119,8 @@ ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::MPIBackend);
ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::MPIBackend);
ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::MPIBackend);
ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::MPIBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::MPIBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::MPIBackend);
#ifdef HYDROGEN_HAVE_NCCL2
// NCCL backend supports these
@@ -126,7 +132,9 @@ ADD_ALUMINUM_COLLECTIVE( Collective::GATHER, Al::NCCLBackend);
ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::NCCLBackend);
ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::NCCLBackend);
ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::NCCLBackend);
-//ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::NCCLBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::NCCLBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::NCCLBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::NCCLBackend);
#endif // HYDROGEN_HAVE_NCCL2
#ifdef HYDROGEN_HAVE_AL_HOST_XFER
@@ -140,6 +148,8 @@ ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::HostTransferBackend);
ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::HostTransferBackend);
ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::HostTransferBackend);
ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::HostTransferBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::HostTransferBackend);
+ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::HostTransferBackend);
#endif // HYDROGEN_HAVE_AL_HOST_XFER
template
@@ -349,10 +359,26 @@ struct SyncInfoManager
};
#endif // HYDROGEN_HAVE_GPU
+inline bool use_separate_comm_stream() noexcept
+{
+ char const* const env = std::getenv("H_USE_SEPARATE_COMM_STREAM");
+ return (env && std::strlen(env) && env[0] != '0');
+}
+
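+// When H_USE_SEPARATE_COMM_STREAM is unset or falsey, GPU backends
+// below fall back to the default SyncInfo instead of a dedicated
+// per-backend communication stream.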
template <typename BackendT>
SyncInfo<DeviceForBackend<BackendT>()> const& BackendSyncInfo()
{
constexpr Device D = DeviceForBackend<BackendT>();
+#ifdef HYDROGEN_HAVE_GPU
+ if constexpr (D == El::Device::GPU)
+ {
+ static bool const use_separate_stream = use_separate_comm_stream();
+ if (!use_separate_stream)
+ {
+ return El::gpu::DefaultSyncInfo();
+ }
+ }
+#endif // HYDROGEN_HAVE_GPU
static SyncInfoManager si_mgr_(BackendT::Name());
return si_mgr_.si_;
}
diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp
index d51c4d1384..a2527fd27c 100644
--- a/include/El/core/imports/mpi.hpp
+++ b/include/El/core/imports/mpi.hpp
@@ -173,6 +173,15 @@ extern template struct Types; // Avoid conflict with Int
#undef PROTO
#endif // !defined H_INSTANTIATING_MPI_TYPES_STRUCT
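+// extern declarations suppress implicit instantiation of the
+// half-precision Types<> in every translation unit; the explicit
+// instantiations are provided once in the library.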
+#ifdef HYDROGEN_HAVE_HALF
+extern template struct Types<cpu_half_type>;
+extern template struct Types<Complex<cpu_half_type>>;
+#endif
+#ifdef HYDROGEN_GPU_USE_FP16
+extern template struct Types<gpu_half_type>;
+extern template struct Types<Complex<gpu_half_type>>;
+#endif
+
template <typename T>
struct MPIBaseHelper { typedef T value; };
template <typename T>
diff --git a/include/hydrogen/PoolAllocator.hpp b/include/hydrogen/PoolAllocator.hpp
new file mode 100644
index 0000000000..b251a7171f
--- /dev/null
+++ b/include/hydrogen/PoolAllocator.hpp
@@ -0,0 +1,1145 @@
+// See LICENSE for Hydrogen license. Original license for CUB follows:
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Extended asynchronous pooling allocator with exponential, multiplicative, and
+ * user-specified bin sizes. This allocator is based on CUB's pooling allocator
+ * and can use {cuda,hip}MallocAsync as necessary. It also provides extensive
+ * reporting for allocations, bins, and extraneous memory.
+ ******************************************************************************/
+
+#ifndef HYDROGEN_POOLALLOCATOR_HPP_
+#define HYDROGEN_POOLALLOCATOR_HPP_
+
+#include
+#include
+#include