diff --git a/CMakeLists.txt b/CMakeLists.txt index 219abe707f..ecbe09895e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ endif (__GIT_EXECUTABLE) # This must be set because version tags set(HYDROGEN_VERSION_MAJOR 1) set(HYDROGEN_VERSION_MINOR 5) -set(HYDROGEN_VERSION_PATCH 2) +set(HYDROGEN_VERSION_PATCH 4) set(HYDROGEN_VERSION_MAJOR_MINOR "${HYDROGEN_VERSION_MAJOR}.${HYDROGEN_VERSION_MINOR}") set(HYDROGEN_VERSION @@ -101,10 +101,7 @@ if (NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) endif () # Feature-related options - -option(Hydrogen_ENABLE_ALUMINUM - "Enable the Aluminum package for improved device-side communication." - OFF) +include(CMakeDependentOption) option(Hydrogen_ENABLE_CUDA "Search for CUDA support and enable related features if found." @@ -114,6 +111,12 @@ option(Hydrogen_ENABLE_ROCM "Search for ROCm/HIP support and enable related features if found." OFF) +cmake_dependent_option(Hydrogen_ENABLE_ROCTRACER + "Search for Roctracer and enable related features if found." + OFF + "Hydrogen_ENABLE_ROCM" + OFF) + if (Hydrogen_ENABLE_ROCM AND Hydrogen_ENABLE_CUDA) message(FATAL_ERROR "ROCm and CUDA code paths are mutually exclusive. " @@ -313,10 +316,6 @@ if (Hydrogen_ENABLE_CUDA) set(HYDROGEN_GPU_USE_TENSOR_OP_MATH TRUE) endif () - if (Hydrogen_ENABLE_GPU_FP16) - set(HYDROGEN_GPU_USE_FP16 TRUE) - endif () - if (Hydrogen_ENABLE_CUB) if (CUDAToolkit_VERSION_MAJOR LESS 11) find_package(CUB MODULE REQUIRED) @@ -358,74 +357,102 @@ if (Hydrogen_ENABLE_ROCM) find_package(rocsolver CONFIG REQUIRED) find_package(rocthrust CONFIG REQUIRED) + if (Hydrogen_ENABLE_ROCTRACER) + find_package(Roctracer MODULE COMPONENTS roctx) + set(HYDROGEN_HAVE_ROCTRACER ${Roctracer_FOUND}) + endif () + + include(HydrogenCleanupHIPTargets) + h_clean_hip_targets() + set(HYDROGEN_HAVE_ROCM TRUE) message(STATUS "Found ROCm/HIP toolchain. 
Using HIP/ROCm.") set(H_ROCM_CXX_LIBS hip::host - hip::hipcub roc::rocblas roc::rocsolver - roc::rocthrust) - + roc::rocthrust + ${Roctracer_LIBRARIES}) + if (HYDROGEN_HAVE_CUB) + list(APPEND H_ROCM_CXX_LIBS hip::hipcub) + endif () set(H_ROCM_HIP_LIBS hip::device) endif (Hydrogen_ENABLE_ROCM) if (HYDROGEN_HAVE_CUDA OR HYDROGEN_HAVE_ROCM) set(HYDROGEN_HAVE_GPU TRUE) + + if (Hydrogen_ENABLE_GPU_FP16) + set(HYDROGEN_GPU_USE_FP16 TRUE) + endif () endif () -if (Hydrogen_ENABLE_ALUMINUM) - find_package(Aluminum 1.0.0 CONFIG QUIET) +find_package(Aluminum 1.0.0 CONFIG QUIET) +if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE) + message(STATUS + "A candidate Aluminum > v1.0.0 was found, but was not selected:") + message(STATUS + " ${Aluminum_NOT_FOUND_MESSAGE}") +endif () +# Try again, since we're technically ok with >v0.7.0 +if (NOT Aluminum_FOUND) + find_package(Aluminum 0.7.0 CONFIG QUIET) if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE) message(STATUS - "A candidate Aluminum > v1.0.0 was found, but was not selected:") + "A candidate Aluminum > v0.7.0 was found, but was not selected:") message(STATUS " ${Aluminum_NOT_FOUND_MESSAGE}") endif () - # Try again, since we're technically ok with >v0.7.0 - if (NOT Aluminum_FOUND) - find_package(Aluminum 0.7.0 CONFIG QUIET) - if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE) - message(STATUS - "A candidate Aluminum > v0.7.0 was found, but was not selected:") - message(STATUS - " ${Aluminum_NOT_FOUND_MESSAGE}") - endif () - endif () +endif () - if (Aluminum_FOUND) - set(HYDROGEN_HAVE_ALUMINUM TRUE) - message(STATUS - "Found Aluminum@${ALUMINUM_VERSION}: ${Aluminum_DIR}") +if (Aluminum_FOUND) + set(HYDROGEN_HAVE_ALUMINUM TRUE) + message(STATUS + "Found Aluminum@${ALUMINUM_VERSION}: ${Aluminum_DIR}") - if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) - set(HYDROGEN_HAVE_NCCL2 TRUE) - message(STATUS "Aluminum detected with NCCL2 backend support.") - else () - set(HYDROGEN_HAVE_NCCL2 FALSE) - endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) + if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) + set(HYDROGEN_HAVE_NCCL2 TRUE) + message(STATUS "Aluminum detected with NCCL2 backend support.") + else () + set(HYDROGEN_HAVE_NCCL2 FALSE) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) - if (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) - set(HYDROGEN_HAVE_AL_HOST_XFER TRUE) - message(STATUS "Aluminum detected with HostTransfer backend support.") - else () - set(HYDROGEN_HAVE_AL_HOST_XFER FALSE) - endif (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) + if (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) + set(HYDROGEN_HAVE_AL_HOST_XFER TRUE) + message(STATUS "Aluminum detected with HostTransfer backend support.") + else () + set(HYDROGEN_HAVE_AL_HOST_XFER FALSE) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) - if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) - set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE) - message(STATUS "Aluminum detected with MPI-CUDA backend support.") - else () - set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE) - endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) + if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) + set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE) + message(STATUS "Aluminum detected with MPI-CUDA backend support.") + else () + set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) + + # Check for in-place SendRecv. + if (ALUMINUM_VERSION VERSION_GREATER_EQUAL "1.3.0") + set(HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV TRUE) else () - message(FATAL_ERROR "Aluminum support requested but not found. 
" - "Please set Aluminum_DIR to point to the installation prefix " - "for Aluminum.") - endif (Aluminum_FOUND) -endif (Hydrogen_ENABLE_ALUMINUM) + set(HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV FALSE) + endif () + + if (HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV) + message(STATUS "Aluminum detected with in-place SendRecv support.") + else () + message(STATUS "Aluminum detected WITHOUT in-place SendRecv support.") + endif () + +else () + + message(FATAL_ERROR "Aluminum support required but not found. " + "Please set Aluminum_ROOT to its installation prefix or add " + "the installation prefix to CMAKE_PREFIX_PATH.") + +endif (Aluminum_FOUND) # Sets up EL_RESTRICT and EL_HAVE_PRETTY_FUNCTION include(detect/CXX) @@ -484,7 +511,7 @@ add_subdirectory(src) # docs (which has the advantage that preprocessing will take # "{,hydrogen_}config.h" into consideration). configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/config.h.in" - "${PROJECT_BINARY_DIR}/include/El/config.h") + "${PROJECT_BINARY_DIR}/include/El/config.h" ESCAPE_QUOTES) configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/hydrogen_config.h.in" "${PROJECT_BINARY_DIR}/include/El/hydrogen_config.h") configure_file("${PROJECT_SOURCE_DIR}/doxy/Doxyfile.in" diff --git a/ElementalREADME.md b/ElementalREADME.md index 570370b917..1e9dae8091 100644 --- a/ElementalREADME.md +++ b/ElementalREADME.md @@ -1,10 +1,3 @@ -
- -[![Build Status](https://api.travis-ci.org/elemental/Elemental.svg?branch=master)](https://travis-ci.org/elemental/Elemental) -[![Join the chat at https://gitter.im/elemental/chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/elemental/chat?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) - **Elemental** is a modern C++ library for distributed-memory dense and sparse-direct linear algebra, conic optimization, and lattice reduction. The library was initially released in @@ -15,7 +8,7 @@ was originally released during a project on [Parallel Sweeping Preconditioners]( ### Documentation -The (now outdated) [documentation for Elemental](http://libelemental.org/documentation) is built using [Sphinx](http://sphinx.pocoo.org) and the [Read the Docs Theme](http://docs.readthedocs.org/en/latest/theme.html) +The (now outdated) documentation for Elemental is built using [Sphinx](http://sphinx.pocoo.org) and the [Read the Docs Theme](http://docs.readthedocs.org/en/latest/theme.html) ### Unique features diff --git a/cmake/configure_files/HydrogenConfig.cmake.in b/cmake/configure_files/HydrogenConfig.cmake.in index 8127f803ec..add71cc1b5 100644 --- a/cmake/configure_files/HydrogenConfig.cmake.in +++ b/cmake/configure_files/HydrogenConfig.cmake.in @@ -92,6 +92,7 @@ if (_HYDROGEN_HAVE_CUDA) endif () set(_HYDROGEN_HAVE_ROCM @HYDROGEN_HAVE_ROCM@) +set(_HYDROGEN_HAVE_ROCTRACER @HYDROGEN_HAVE_ROCTRACER@) if (_HYDROGEN_HAVE_ROCM) find_dependency(hip CONFIG) @@ -106,6 +107,14 @@ if (_HYDROGEN_HAVE_ROCM) find_dependency(rocblas CONFIG) find_dependency(rocsolver CONFIG) find_dependency(rocthrust CONFIG) + + if (_HYDROGEN_HAVE_ROCTRACER) + find_dependency(Roctracer MODULE COMPONENTS roctx) + endif () + + include(HydrogenCleanupHIPTargets) + h_clean_hip_targets() + set(HYDROGEN_HAVE_ROCM TRUE) endif (_HYDROGEN_HAVE_ROCM) @@ -114,7 +123,7 @@ if (HYDROGEN_HAVE_HALF) find_dependency(HALF) endif () -if (_HYDROGEN_HAVE_CUDA) +if (_HYDROGEN_HAVE_CUDA OR _HYDROGEN_HAVE_ROCM) set(HYDROGEN_GPU_USE_FP16 @HYDROGEN_GPU_USE_FP16@) endif () diff --git a/cmake/configure_files/hydrogen_config.h.in b/cmake/configure_files/hydrogen_config.h.in index a3f7434d2d..02c7993b92 100644 --- a/cmake/configure_files/hydrogen_config.h.in +++ b/cmake/configure_files/hydrogen_config.h.in @@ -40,6 +40,7 @@ // ROCm stuff #cmakedefine HYDROGEN_HAVE_ROCM +#cmakedefine HYDROGEN_HAVE_ROCTRACER // General GPU stuff #cmakedefine HYDROGEN_HAVE_CUB @@ -51,6 +52,7 @@ #cmakedefine HYDROGEN_HAVE_NCCL2 #cmakedefine HYDROGEN_HAVE_AL_MPI_CUDA #cmakedefine HYDROGEN_HAVE_AL_HOST_XFER +#cmakedefine HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV #cmakedefine HYDROGEN_ENSURE_HOST_MPI_BUFFERS #cmakedefine HYDROGEN_HAVE_CUDA_AWARE_MPI diff --git a/cmake/modules/FindRoctracer.cmake b/cmake/modules/FindRoctracer.cmake new file mode 100644 index 0000000000..d209bb1fee --- /dev/null +++ b/cmake/modules/FindRoctracer.cmake @@ -0,0 +1,78 @@ +# Sets the following variables +# +# Roctracer_FOUND +# Roctracer_LIBRARIES +# +# Defines the following imported target: +# +# roctracer::roctracer +# roctracer::roctracer_api +# roctracer::roctx_api +# + +set(_supported_components roctracer roctx) +if (NOT Roctracer_FIND_COMPONENTS) + set(Roctracer_FIND_COMPONENTS ${_supported_components}) +endif () + +foreach (comp IN LISTS Roctracer_FIND_COMPONENTS) + if (NOT ${comp} IN_LIST _supported_components) + message(FATAL_ERROR + "Cannot specify component \"${comp}\" for package Roctracer. 
" + "Supported components are: ${_supported_components}.") + endif () + + set(_header_name "${comp}.h") + set(_lib_name "${comp}64") + + find_path(${comp}_INCLUDE_PATH ${_header_name} + HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer + PATH_SUFFIXES include + DOC "The ${comp} include directory for roctracer." + NO_DEFAULT_PATH) + find_path(${comp}_INCLUDE_PATH ${_header_name} + HINTS ${ROCM_PATH}/include/roctracer $ENV{ROCM_PATH}/include/roctracer + DOC "The ${comp} include directory for roctracer." + NO_DEFAULT_PATH) + find_path(${comp}_INCLUDE_PATH ${_header_name}) + + find_library(${comp}_LIBRARY ${_lib_name} + HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer + HINTS ${ROCM_PATH} $ENV{ROCM_PATH} + PATH_SUFFIXES lib64 lib + DOC "The ${comp} library for roctracer." + NO_DEFAULT_PATH) + find_library(${comp}_LIBRARY ${_lib_name}) + + if (${comp}_LIBRARY AND ${comp}_INCLUDE_PATH) + set(Roctracer_${comp}_FOUND TRUE) + + if (NOT TARGET roctracer::${comp}_api) + add_library(roctracer::${comp}_api INTERFACE IMPORTED) + endif () + target_link_libraries(roctracer::${comp}_api INTERFACE + "${${comp}_LIBRARY}") + target_include_directories(roctracer::${comp}_api INTERFACE + "${${comp}_INCLUDE_PATH}") + + mark_as_advanced(${comp}_LIBRARY) + mark_as_advanced(${comp}_INCLUDE_PATH) + + list(APPEND _imported_libraries roctracer::${comp}_api) + else () + set(Roctracer_${comp}_FOUND FALSE) + endif () +endforeach () + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Roctracer HANDLE_COMPONENTS) + +if (Roctracer_FOUND) + if (NOT TARGET roctracer::roctracer) + add_library(roctracer::roctracer INTERFACE IMPORTED) + endif () + foreach (lib IN LISTS _imported_libraries) + target_link_libraries(roctracer::roctracer INTERFACE ${lib}) + endforeach () + set(Roctracer_LIBRARIES roctracer::roctracer) +endif (Roctracer_FOUND) diff --git a/cmake/modules/HydrogenCleanupHIPTargets.cmake b/cmake/modules/HydrogenCleanupHIPTargets.cmake new file mode 100644 index 0000000000..90d12b2590 --- /dev/null +++ b/cmake/modules/HydrogenCleanupHIPTargets.cmake @@ -0,0 +1,54 @@ +# This just finds some stuff correctly and cleans up the HIP targets. +macro(h_clean_hip_targets) + set(HIP_CLANG_ROOT "$ENV{ROCM_PATH}/llvm") + + file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS + "${HIP_CLANG_ROOT}/lib/clang/*/include") + find_path(HIP_CLANG_INCLUDE_PATH stddef.h + HINTS "${HIP_CLANG_INCLUDE_SEARCH_PATHS}" + NO_DEFAULT_PATH) + + if (HIP_CLANG_INCLUDE_PATH) + message(STATUS "Found clang include path: ${HIP_CLANG_INCLUDE_PATH}") + else () + message(WARNING + "Could not find clang include path. " + "Using whatever is in the hip IMPORTED targets") + endif () + + file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS + "${HIP_CLANG_ROOT}/lib/clang/*/lib/*") + find_library(ACTUAL_CLANGRT_BUILTINS clangrt-builtins + NAMES + clang_rt.builtins + clang_rt.builtins-x86_64 + PATHS + "${HIP_CLANGRT_LIB_SEARCH_PATHS}") + + if (ACTUAL_CLANGRT_BUILTINS) + message(STATUS "Found clangrt builtins: ${ACTUAL_CLANGRT_BUILTINS}") + else () + message(WARNING + "Could not find clangrt builtins. 
" + "Using whatever is in the hip IMPORTED targets") + endif () + + get_target_property(_HIP_HOST_LIBS hip::host INTERFACE_LINK_LIBRARIES) + get_target_property(_HIP_DEVICE_LIBS hip::device INTERFACE_LINK_LIBRARIES) + + string(REPLACE + "CLANGRT_BUILTINS-NOTFOUND" + "${ACTUAL_CLANGRT_BUILTINS}" + _NEW_HIP_HOST_LIBS + "${_HIP_HOST_LIBS}") + string(REPLACE + "CLANGRT_BUILTINS-NOTFOUND" + "${ACTUAL_CLANGRT_BUILTINS}" + _NEW_HIP_DEVICE_LIBS + "${_HIP_DEVICE_LIBS}") + + set_property(TARGET hip::host + PROPERTY INTERFACE_LINK_LIBRARIES ${_NEW_HIP_HOST_LIBS}) + set_property(TARGET hip::device + PROPERTY INTERFACE_LINK_LIBRARIES ${_NEW_HIP_DEVICE_LIBS}) +endmacro() diff --git a/include/El/blas_like/level1/Copy/Translate.hpp b/include/El/blas_like/level1/Copy/Translate.hpp index 55579afb13..db3f68718e 100644 --- a/include/El/blas_like/level1/Copy/Translate.hpp +++ b/include/El/blas_like/level1/Copy/Translate.hpp @@ -67,9 +67,15 @@ void Translate( const Int maxWidth = MaxLength(width, rowStride); const Int pkgSize = mpi::Pad(maxHeight*maxWidth); - simple_buffer buffer; - if(crossRank == root || crossRank == B.Root()) - buffer.allocate(pkgSize); + // When crossRank == root, this will be a SEND buffer + // (+SendRecv when !aligned), and it should use + // syncInfoA. When crossRank == B.Root(), this will be a RECV + // buffer, and it should use syncInfoB. Otherwise, this isn't + // used. + simple_buffer buffer = + (crossRank == root || crossRank == B.Root() + ? simple_buffer(pkgSize, syncInfoA) + : simple_buffer{}); const Int colAlignB = B.ColAlign(); const Int rowAlignB = B.RowAlign(); diff --git a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp index 4783359c9c..77cf807c28 100644 --- a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp +++ b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp @@ -10,6 +10,8 @@ #define EL_BLAS_COPY_TRANSLATEBETWEENGRIDS_HPP #include "core/environment/decl.hpp" +#include + namespace El { namespace copy @@ -3562,7 +3564,6 @@ void TranslateBetweenGridsAsync const Int mLocA = A.LocalHeight(); const Int nLocA = A.LocalWidth(); - mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); mpi::Group owningGroupA = A.Grid().OwningGroup(); @@ -3856,7 +3857,8 @@ void TranslateBetweenGrids( EL_DEBUG_CSE; /* Overview - + We broadcast the size of A to all the ranks in B to make sure that + all ranks in B subgrid has the correct size of A. Since we are using blocking communication, some care is required to avoid deadlocks. 
Let's start with a naive algorithm for [STAR,VC] matrices and optimize it in steps: @@ -3883,21 +3885,55 @@ void TranslateBetweenGrids( */ // Matrix dimensions - const Int m = A.Height(); - const Int n = A.Width(); + Int m = A.Height(); + Int n = A.Width(); + Int strideA = A.RowStride(); + Int ALDim = A.LDim(); + + mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); + + bool const inAGrid = A.Participating(); + bool const inBGrid = B.Participating(); + + Int recvMetaData[4]; + Int metaData[4]; + if(inAGrid) + { + metaData[0] = m; + metaData[1] = n; + metaData[2] = strideA; + metaData[3] = ALDim; + } + else + { + metaData[0] = 0; + metaData[1] = 0; + metaData[2] = 0; + metaData[3] = 0; + } + const std::vector sendMetaData (metaData, metaData + 4); + mpi::AllReduce(sendMetaData.data(), + recvMetaData, + 4, + mpi::MAX, + viewingCommB, + SyncInfo{}); + + m = recvMetaData[0]; + n = recvMetaData[1]; + strideA = recvMetaData[2]; + ALDim =recvMetaData[3]; + B.Resize(m, n); const Int nLocA = A.LocalWidth(); const Int nLocB = B.LocalWidth(); // Return immediately if there is no local data - const bool inAGrid = A.Participating(); - const bool inBGrid = B.Participating(); if (!inAGrid && !inBGrid) { return; } // Compute the number of messages to send/recv - const Int strideA = A.RowStride(); const Int strideB = B.RowStride(); const Int strideGCD = GCD(strideA, strideB); const Int numSends = Min(strideB/strideGCD, nLocA); @@ -3906,14 +3942,24 @@ void TranslateBetweenGrids( // Synchronize compute streams SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); SyncInfo syncInfoB = SyncInfoFromMatrix(B.Matrix()); - auto syncHelper = MakeMultiSync(syncInfoB, syncInfoA); - const SyncInfo& syncInfo = syncHelper; + + std::optional> maybeMultiSync; + if (inAGrid && inBGrid) + maybeMultiSync.emplace(syncInfoB, syncInfoA); + + SyncInfo const syncInfo = + (maybeMultiSync.has_value() + ? *maybeMultiSync + : (inAGrid ? syncInfoA : syncInfoB)); + + // Collective! + mpi::EnsureComm(viewingCommB, syncInfo); + mpi::EnsureComm(viewingCommB, syncInfo); // Translate the ranks from A's VC communicator to B's viewing so // that we can match send/recv communicators. Since A's VC // communicator is not necessarily defined on every process, we // instead work with A's owning group. - mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); mpi::Group owningGroupA = A.Grid().OwningGroup(); const int sizeA = A.Grid().Size(); vector viewingRanksA(sizeA), owningRanksA(sizeA); @@ -3976,7 +4022,7 @@ void TranslateBetweenGrids( // Copy data locally copy::util::InterleaveMatrix( m, messageWidth, - A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(), + A.LockedBuffer(0,jLocA), 1, numSends*ALDim, B.Buffer(0,jLocB), 1, numRecvs*B.LDim(), syncInfo); } @@ -3984,7 +4030,7 @@ void TranslateBetweenGrids( // Send data to other rank copy::util::InterleaveMatrix( m, messageWidth, - A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(), + A.LockedBuffer(0,jLocA), 1, numSends*ALDim, messageBuf.data(), 1, m, syncInfo); mpi::Send( diff --git a/include/El/core.hpp b/include/El/core.hpp index 173e0e6247..4009f55084 100644 --- a/include/El/core.hpp +++ b/include/El/core.hpp @@ -80,14 +80,10 @@ using hydrogen::gpu_half_type; #endif // HYDROGEN_GPU_USE_FP16 } -#if __cplusplus >= 201402L -#define H_DEPRECATED(msg) [[deprecated(msg)]] -#elif defined(__GNUC__) -// This ^ isn't perfect -- many non-GCC compilers define __GNUC__. -#define H_DEPRECATED(msg) __attribute__ ((deprecated(msg))) -#else +// NOTE: These have not been as inspirational as I had hoped. 
I'm +// leaving the notes but preprocessing them away so the compile +// warnings stop. #define H_DEPRECATED(msg) -#endif #define EL_UNUSED(expr) (void)(expr) diff --git a/include/El/core/Element/decl.hpp b/include/El/core/Element/decl.hpp index 72d30b19c2..7cf87f7415 100644 --- a/include/El/core/Element/decl.hpp +++ b/include/El/core/Element/decl.hpp @@ -9,6 +9,7 @@ #ifndef EL_ELEMENT_DECL_HPP #define EL_ELEMENT_DECL_HPP +#include #include #include @@ -262,7 +263,7 @@ template(alpha); } }; template @@ -272,6 +273,37 @@ struct Caster,void> { return Complex( T(RealPart(alpha)), T(ImagPart(alpha)) ); } }; +#if defined HYDROGEN_HAVE_ROCM && defined HYDROGEN_GPU_USE_FP16 +#if defined HYDROGEN_HAVE_HALF +template <> +struct Caster +{ + static cpu_half_type Cast(gpu_half_type const& in) + { + return cpu_half_type(static_cast(in)); + } +};// Caster +#endif // defined HYDROGEN_HAVE_HALF + +template <> +struct Caster +{ + static double Cast(gpu_half_type const& in) + { + return static_cast(in); + } +};// Caster + +template <> +struct Caster> +{ + static Complex Cast(gpu_half_type const& alpha) + { + return Complex(static_cast(alpha), gpu_half_type(0)); + } +}; +#endif // defined HYDROGEN_HAVE_ROCM && defined HYDROGEN_GPU_USE_FP16 + // Set the real/imaginary part of an element // ----------------------------------------- template #include +#include #include #include #include @@ -21,6 +22,7 @@ namespace El { namespace details { + template void ThrowRuntimeError(Args&&... args) { @@ -29,6 +31,25 @@ void ThrowRuntimeError(Args&&... args) (void) dummy; throw std::runtime_error(oss.str()); } + +/** @brief Returns true iff env(H_MEMPOOL_DEBUG) is truthy. + * + * Truthy values are non-empty strings that start with any character + * other than '0' (ASCII "zero"). So "true", "false", "1", "13", + * "-q", ":)", and " " are all truthy, while "", "0true", "0false", + * "0000", "0123", and "0:)" are all falsey. + */ +bool debug_mempool() noexcept; + +/** @brief Check env(H_MEMPOOL_BIN_GROWTH). Default 1.6f. */ +float default_mempool_bin_growth() noexcept; + +/** @brief Check env(H_MEMPOOL_MIN_BIN). Default 1UL. */ +size_t default_mempool_min_bin() noexcept; + +/** @brief Check env(H_MEMPOOL_MAX_BIN). Default (1<<26). */ +size_t default_mempool_max_bin() noexcept; + } // namespace details /** Simple caching memory pool. @@ -50,10 +71,13 @@ class MemoryPool * @param bin_growth Controls how fast bins grow. * @param min_bin_size Smallest bin size (in bytes). * @param max_bin_size Largest bin size (in bytes). + * @param debug Print debugging messages. */ - MemoryPool(float bin_growth = 1.6, - size_t min_bin_size = 1, - size_t max_bin_size = 1<<26) + MemoryPool(float const bin_growth = details::default_mempool_bin_growth(), + size_t const min_bin_size = details::default_mempool_min_bin(), + size_t const max_bin_size = details::default_mempool_max_bin(), + bool const debug = details::debug_mempool()) + : debug_{debug} { std::set bin_sizes; for (float bin_size = min_bin_size; @@ -74,23 +98,47 @@ class MemoryPool // Set up bins. for (size_t i = 0; i < bin_sizes_.size(); ++i) free_data_.emplace_back(); + if (debug_) + { + std::clog << "==Mempool(" << this << ")== " + << "Created memory pool (" + << "pinned=" << (Pinned ? 
"t" : "f") + << ", growth=" << bin_growth + << ", min bin=" << bin_sizes_.front() + << ", max bin=" << bin_sizes_.back() << ")\n" + << "==Mempool(" << this << ")== " + << "Bin sizes: ["; + for (auto const& b : bin_sizes_) + std::clog << " " << b; + std::clog << " ]" << std::endl; + } } ~MemoryPool() { FreeAllUnused(); + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << alloc_to_bin_.size() + << " dangling allocations\n" + << "==Mempool(" << this << ")== " + << "Destroyed memory pool" + << std::endl; } /** Return memory of size bytes. */ void* Allocate(size_t size) { - size_t bin = get_bin(size); + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Requesting allocation of " + << size << " bytes." + << std::endl; + size_t const bin = get_bin(size); void* mem = nullptr; std::lock_guard lock(mutex_); // size is too large, this will not be cached. if (bin == INVALID_BIN) - { mem = do_allocation(size); - } else { // Check if there is available memory in our bin. @@ -98,6 +146,10 @@ class MemoryPool { mem = free_data_[bin].back(); free_data_[bin].pop_back(); + --num_cached_blks_; + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Reusing cached pointer " << mem << "\n"; } else { @@ -105,41 +157,45 @@ class MemoryPool } } alloc_to_bin_[mem] = bin; + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << alloc_to_bin_.size() + << " blocks allocated; " + << num_cached_blks_ + << " blocks cached" + << std::endl; + return mem; } /** Release previously allocated memory. */ void Free(void* ptr) { std::lock_guard lock(mutex_); - auto iter = alloc_to_bin_.find(ptr); + auto const iter = alloc_to_bin_.find(ptr); if (iter == alloc_to_bin_.end()) - { details::ThrowRuntimeError("Tried to free unknown ptr"); - } + + size_t const& bin = iter->second; + alloc_to_bin_.erase(iter); + if (bin == INVALID_BIN) + do_free(ptr); else { - size_t bin = iter->second; - alloc_to_bin_.erase(iter); - if (bin == INVALID_BIN) - { - do_free(ptr); - } - else - { - // Cache the pointer for reuse. - free_data_[bin].push_back(ptr); - } + // Cache the pointer for reuse. + free_data_[bin].push_back(ptr); + ++num_cached_blks_; + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Cached pointer " << ptr << "\n"; } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << alloc_to_bin_.size() + << " blocks allocated; " + << num_cached_blks_ + << " blocks cached" + << std::endl; } - /** Release all unused memory. */ - void FreeAllUnused() - { - std::lock_guard lock(mutex_); - for (size_t bin = 0; bin < bin_sizes_.size(); ++bin) - for (auto&& ptr : free_data_[bin]) - do_free(ptr); - } - private: /** Index of an invalid bin. */ @@ -158,6 +214,25 @@ class MemoryPool /** Map used pointers to the associated bin index. */ std::unordered_map alloc_to_bin_; + /** Track the total number of available blocks. */ + size_t num_cached_blks_; + + /** Print debugging messages throughout lifetime. */ + bool debug_; + + /** Release all unused memory. */ + void FreeAllUnused() + { + std::lock_guard lock(mutex_); + for (size_t bin = 0; bin < bin_sizes_.size(); ++bin) + { + for (auto&& ptr : free_data_[bin]) + do_free(ptr); + std::vector{}.swap(free_data_[bin]); + } + num_cached_blks_ = 0ul; + } + /** Allocate size bytes. */ inline void* do_allocation(size_t size); /** Free ptr. 
*/ @@ -179,7 +254,7 @@ class MemoryPool #ifdef HYDROGEN_HAVE_CUDA template <> -inline void* MemoryPool::do_allocation(size_t bytes) +inline void* MemoryPool::do_allocation(size_t const bytes) { void* ptr; auto error = cudaMallocHost(&ptr, bytes); @@ -189,11 +264,15 @@ inline void* MemoryPool::do_allocation(size_t bytes) "Failed to allocate CUDA pinned memory with message: ", "\"", cudaGetErrorString(error), "\""); } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Allocated pinned " << bytes << " bytes at " << ptr + << std::endl; return ptr; } template<> -inline void MemoryPool::do_free(void* ptr) +inline void MemoryPool::do_free(void* const ptr) { auto error = cudaFreeHost(ptr); if (error != cudaSuccess) @@ -202,49 +281,64 @@ inline void MemoryPool::do_free(void* ptr) "Failed to free CUDA pinned memory with message: ", "\"", cudaGetErrorString(error), "\""); } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Freed pinned ptr " << ptr + << std::endl; } #elif defined(HYDROGEN_HAVE_ROCM) template <> -inline void* MemoryPool::do_allocation(size_t bytes) +inline void* MemoryPool::do_allocation(size_t const bytes) { void* ptr; auto error = hipHostMalloc(&ptr, bytes); if (error != hipSuccess) - { details::ThrowRuntimeError( "Failed to allocate HIP pinned memory with message: ", "\"", hipGetErrorString(error), "\""); - } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Allocated pinned " << bytes << " bytes at " << ptr + << std::endl; return ptr; } template<> -inline void MemoryPool::do_free(void* ptr) +inline void MemoryPool::do_free(void* const ptr) { auto error = hipHostFree(ptr); if (error != hipSuccess) - { details::ThrowRuntimeError( "Failed to free HIP pinned memory with message: ", "\"", hipGetErrorString(error), "\""); - } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Freed pinned ptr " << ptr + << std::endl; } #endif // HYDROGEN_HAVE_CUDA template <> -inline void* MemoryPool::do_allocation(size_t bytes) { +inline void* MemoryPool::do_allocation(size_t const bytes) +{ void* ptr = std::malloc(bytes); if (ptr == nullptr) - { details::ThrowRuntimeError("Failed to allocate memory"); - } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Allocated " << bytes << " bytes at " << ptr + << std::endl; return ptr; } template<> -inline void MemoryPool::do_free(void* ptr) +inline void MemoryPool::do_free(void* const ptr) { - return std::free(ptr); + std::free(ptr); + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Freed ptr " << ptr + << std::endl; } #ifdef HYDROGEN_HAVE_GPU diff --git a/include/El/core/Profiling.hpp b/include/El/core/Profiling.hpp index a09199ff17..45d2a7181b 100644 --- a/include/El/core/Profiling.hpp +++ b/include/El/core/Profiling.hpp @@ -19,6 +19,10 @@ void DisableVTune() noexcept; void EnableNVProf() noexcept; void DisableNVProf() noexcept; +// These are no-ops if roctracer is not enabled at compile time +void EnableROCTX() noexcept; +void DisableROCTX() noexcept; + /** \brief A selection of colors to use with the profiling interface. 
* * It seems unlikely that a user will ever need to access these by diff --git a/include/El/core/environment/decl.hpp b/include/El/core/environment/decl.hpp index 8426ee1f9d..b9367fde42 100644 --- a/include/El/core/environment/decl.hpp +++ b/include/El/core/environment/decl.hpp @@ -35,7 +35,10 @@ using std::ostream; using std::ostringstream; using std::exception; -using std::uncaught_exception; +inline bool uncaught_exception() noexcept +{ + return std::uncaught_exceptions() > 0; +} void PrintVersion( ostream& os=cout ); void PrintConfig( ostream& os=cout ); diff --git a/include/El/core/imports/aluminum.hpp b/include/El/core/imports/aluminum.hpp index 6cc8391727..67da59d507 100644 --- a/include/El/core/imports/aluminum.hpp +++ b/include/El/core/imports/aluminum.hpp @@ -31,7 +31,11 @@ enum class Collective REDUCE, REDUCESCATTER, SCATTER, - SENDRECV + + // Not collectives, but what can you do + SENDRECV, + SEND, + RECV, };// enum class Collective #ifndef HYDROGEN_HAVE_ALUMINUM @@ -115,6 +119,8 @@ ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::MPIBackend); ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::MPIBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::MPIBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::MPIBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::MPIBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::MPIBackend); #ifdef HYDROGEN_HAVE_NCCL2 // NCCL backend supports these @@ -126,7 +132,9 @@ ADD_ALUMINUM_COLLECTIVE( Collective::GATHER, Al::NCCLBackend); ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::NCCLBackend); ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::NCCLBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::NCCLBackend); -//ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::NCCLBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::NCCLBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::NCCLBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::NCCLBackend); #endif // HYDROGEN_HAVE_NCCL2 #ifdef HYDROGEN_HAVE_AL_HOST_XFER @@ -140,6 +148,8 @@ ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::HostTransferBackend); ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::HostTransferBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::HostTransferBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::HostTransferBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::HostTransferBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::HostTransferBackend); #endif // HYDROGEN_HAVE_AL_HOST_XFER template @@ -349,10 +359,26 @@ struct SyncInfoManager }; #endif // HYDROGEN_HAVE_GPU +inline bool use_separate_comm_stream() noexcept +{ + char const* const env = std::getenv("H_USE_SEPARATE_COMM_STREAM"); + return (env && std::strlen(env) && env[0] != '0'); +} + template SyncInfo()> const& BackendSyncInfo() { constexpr Device D = DeviceForBackend(); +#ifdef HYDROGEN_HAVE_GPU + if constexpr (D == El::Device::GPU) + { + static bool const use_separate_stream = use_separate_comm_stream(); + if (!use_separate_stream) + { + return El::gpu::DefaultSyncInfo(); + } + } +#endif // HYDROGEN_HAVE_GPU static SyncInfoManager si_mgr_(BackendT::Name()); return si_mgr_.si_; } diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index d51c4d1384..a2527fd27c 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -173,6 +173,15 @@ extern template struct Types; // Avoid conflict with Int #undef PROTO #endif // !defined H_INSTANTIATING_MPI_TYPES_STRUCT +#ifdef 
HYDROGEN_HAVE_HALF +extern template struct Types; +extern template struct Types>; +#endif +#ifdef HYDROGEN_GPU_USE_FP16 +extern template struct Types; +extern template struct Types>; +#endif + template struct MPIBaseHelper { typedef T value; }; template diff --git a/include/hydrogen/PoolAllocator.hpp b/include/hydrogen/PoolAllocator.hpp new file mode 100644 index 0000000000..b251a7171f --- /dev/null +++ b/include/hydrogen/PoolAllocator.hpp @@ -0,0 +1,1145 @@ +// See LICENSE for Hydrogen license. Original license for CUB follows: +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Extended asynchronous pooling allocator with exponential, multiplicative, and + * user-specified bin sizes. This allocator is based on CUB's pooling allocator + * and can use {cuda,hip}MallocAsync as necessary. It also provides extensive + * reporting for allocations, bins, and extraneous memory. 
+ ******************************************************************************/ + +#ifndef HYDROGEN_POOLALLOCATOR_HPP_ +#define HYDROGEN_POOLALLOCATOR_HPP_ + +#include +#include +#include +#include +#include + +#include + +// Set up functions +#ifdef HYDROGEN_HAVE_CUDA +#include + +#define gpuMallocAsync cudaMallocAsync +#define gpuMalloc cudaMalloc +#define gpuFreeAsync cudaFreeAsync +#define gpuFree cudaFree +#define gpuSetDevice cudaSetDevice +#define gpuGetDevice cudaGetDevice +#define gpuEventQuery cudaEventQuery +#define gpuGetLastError cudaGetLastError +#define gpuEventDestroy cudaEventDestroy +#define gpuEventCreateWithFlags cudaEventCreateWithFlags +#define gpuEventDisableTiming cudaEventDisableTiming +#define gpuEventRecord cudaEventRecord +#define gpuGetErrorString cudaGetErrorString + +#define gpuStream_t cudaStream_t +#define gpuEvent_t cudaEvent_t +#define gpuError_t cudaError_t + +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuErrorMemoryAllocation cudaErrorMemoryAllocation + +#ifndef GPU_PTX_ARCH +#ifndef __CUDA_ARCH__ +#define GPU_PTX_ARCH 0 +#else +#define GPU_PTX_ARCH __CUDA_ARCH__ +#endif +#endif + +#elif defined(HYDROGEN_HAVE_ROCM) +#include + +#define gpuMallocAsync hipMallocAsync +#define gpuMalloc hipMalloc +#define gpuFreeAsync hipFreeAsync +#define gpuFree hipFree +#define gpuSetDevice hipSetDevice +#define gpuGetDevice hipGetDevice +#define gpuEventQuery hipEventQuery +#define gpuGetLastError hipGetLastError +#define gpuEventDestroy hipEventDestroy +#define gpuEventCreateWithFlags hipEventCreateWithFlags +#define gpuEventDisableTiming hipEventDisableTiming +#define gpuEventRecord hipEventRecord +#define gpuGetErrorString hipGetErrorString + +#define gpuStream_t hipStream_t +#define gpuEvent_t hipEvent_t +#define gpuError_t hipError_t + +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuErrorMemoryAllocation hipErrorMemoryAllocation + +#ifndef GPU_PTX_ARCH +#ifndef __HIP_ARCH__ +#define GPU_PTX_ARCH 0 +#else +#define GPU_PTX_ARCH 1 +#endif +#endif +#else +#error "This file must be included with a GPU (CUDA/ROCm) environment" +#endif + +namespace gpudebug { +/* Minimal copy of CubDebug */ +__host__ __device__ __forceinline__ gpuError_t Debug(gpuError_t error, + const char *filename, + int line) { + if (error) { +#if (GPU_PTX_ARCH == 0) + fprintf(stderr, "GPU error %d [%s, %d]: %s\n", error, filename, line, + gpuGetErrorString(error)); + fflush(stderr); +#else + printf("GPU error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", + error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, + threadIdx.x, filename, line); +#endif + } + return error; +} +} // namespace gpudebug + +/** + * \brief Debug macro + */ +#ifndef gpuDebug +#define gpuDebug(e) gpudebug::Debug((gpuError_t)(e), __FILE__, __LINE__) +#endif + +/** + * Prints human-readable size (for reporting) + */ +static inline void HumanReadableSize(size_t bytes, std::ostream &os) { + const std::string sizes[] = {"B", "KiB", "MiB", "GiB", "TiB"}; + int unit = 0; + float size = bytes; + while (size > 1024) { + size /= 1024; + ++unit; + } + auto oldprec = os.precision(unit > 0 ? 
2 : 0); + auto oldf = os.setf(std::ios_base::fixed, std::ios_base::floatfield); + os << size << " " << sizes[unit]; + os.precision(oldprec); + os.setf(oldf); +} + +namespace hydrogen { + +/****************************************************************************** + * PooledDeviceAllocator + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing + * cached device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. + * Once freed, the allocation becomes available immediately for reuse within the + * \p active_stream with which it was associated with during allocation, and it + * becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation + * request of a given size will only consider cached allocations within the + * corresponding bin. + * - (EXTENDED) Bin limits have a combined geometric/linear progression; or can + * be given as a set of sizes. It behaves as follows: + * - If a set of sizes is given in \p bin_sizes, they are used to construct + * the allocation bins. If an allocation is larger than the largest bin, the + * behavior matches the rest of the algorithm. Allocations in [0, bin_min] + * will allocate ``bin_min`` bytes. + * - Bin limits progress geometrically in accordance with the (integer) + * growth factor \p bin_growth provided during construction. Unused device + * allocations within a larger bin cache are not reused for allocation + * requests that categorize to smaller bin sizes. + * Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up + * to ( \p bin_growth ^ \p min_bin ). + * - If two consecutive geometric bins exceed \p bin_mult_threshold and + * \p bin_mult is given, a linear binning scheme is created where bins + * follow \p bin_growth ^ some_bin + \p bin_mult * n + * - Allocations above min( \p bin_growth ^ \p max_bin , \p + * max_bin_alloc_size ) are not rounded up to the nearest bin and are simply + * freed when they are deallocated instead of being returned to a bin-cache. + * - If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they + * are deallocated instead of being returned to their bin-cache. 
+ * + */ +struct PooledDeviceAllocator { + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int)-1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t)-1; + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor { + void *d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + size_t requested_bytes; // Size of true allocation in bytes + bool binned; // Whether the block is part of the pool bins + int device; // device ordinal + gpuStream_t associated_stream; // Associated associated_stream + gpuEvent_t ready_event; // Signal when associated stream has run to the + // point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its + // pointer and device) + BlockDescriptor(void *d_ptr, int device) + : d_ptr(d_ptr), bytes(0), requested_bytes(0), binned(false), + device(device), associated_stream(0), ready_event(0) {} + + // Constructor (suitable for searching maps for a range of suitable blocks, + // given a device) + BlockDescriptor(int device) + : d_ptr(NULL), bytes(0), requested_bytes(0), binned(false), + device(device), associated_stream(0), ready_event(0) {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, + const BlockDescriptor &b) { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each + /// device + typedef std::map GpuCachedBytes; + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow(unsigned int base, unsigned int exp) { + unsigned int retval = 1; + while (exp > 0) { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, + unsigned int base, size_t value) { + power = 0; + rounded_bytes = 1; + + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while 
(rounded_bytes < value) { + rounded_bytes *= base; + power++; + } + } + + size_t NearestMultOf(unsigned int mult, size_t value) { + // Ceiling division followed by multiplication + return ((value + mult - 1) / mult) * mult; + } + + unsigned int ComputeLinearBinIndex(unsigned int bin_growth, + unsigned int bin_mult_threshold) { + if (bin_mult_threshold == INVALID_BIN || bin_growth == 0 || + bin_mult_threshold == 0) + return INVALID_BIN; + + return static_cast(std::log(bin_mult_threshold) / + std::log(bin_growth)); + } + + size_t ComputeMaxBinBytes(unsigned int bin_growth, unsigned int max_bin, + size_t max_bin_alloc_size) { + size_t result = INVALID_SIZE; + if (max_bin != INVALID_BIN) { + result = IntPow(bin_growth, max_bin); + } + if (max_bin_alloc_size != INVALID_SIZE) { + result = std::min(result, max_bin_alloc_size); + } + return result; + } + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + // Extensions + unsigned int bin_mult_threshold; /// Threshold to switch between geometric and + /// linear growth + unsigned int bin_mult; /// Linear bin scaling size + size_t max_bin_alloc_size; /// Maximal binned allocation size + std::set bin_sizes; /// Explicit control over bin sizes + + unsigned int linear_bin_index; /// Geometric bin to consider linear binning + /// from (computed) + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool + skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when + /// destructor is called. (The runtime may have already + /// shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + bool malloc_async; /// Use {cuda,hip}MallocAsync + + std::set actual_bin_sizes; /// Bin sizes used by the allocator + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached + /// bytes on that device + CachedBlocks + cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + PooledDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = + INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = + INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default + ///< is no limit) + bool skip_cleanup = + false, ///< Whether or not to skip a call to \p FreeAllCached() when + ///< the destructor is called (default is to deallocate) + bool debug = false, ///< Whether or not to print (de)allocation events to + ///< stdout (default is no stderr output) + unsigned int bin_mult_threshold = + INVALID_BIN, ///< Threshold to switch between geometric and linear + ///< growth + unsigned int bin_mult = INVALID_BIN, ///< Linear bin scaling size + size_t max_bin_alloc_size = + INVALID_SIZE, ///< Maximal binned allocation size + std::set bin_sizes = {}, ///< Explicit control over bin size + bool use_malloc_async = false) ///< Use asynchronous malloc/free calls + : bin_growth(bin_growth), min_bin(min_bin), max_bin(max_bin), + bin_mult_threshold(bin_mult_threshold), bin_mult(bin_mult), + bin_sizes(bin_sizes), + linear_bin_index(ComputeLinearBinIndex(bin_growth, bin_mult_threshold)), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes( + ComputeMaxBinBytes(bin_growth, max_bin, max_bin_alloc_size)), + max_cached_bytes(max_cached_bytes), skip_cleanup(skip_cleanup), + debug(debug), malloc_async(use_malloc_async), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = + * 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + PooledDeviceAllocator(bool skip_cleanup = false, bool debug = false) + : bin_growth(8), min_bin(3), max_bin(7), bin_mult_threshold(INVALID_BIN), + bin_mult(INVALID_BIN), bin_sizes{}, linear_bin_index(INVALID_BIN), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), skip_cleanup(skip_cleanup), + debug(debug), malloc_async(false), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to + * cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use + * or cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + gpuError_t SetMaxCachedBytes(size_t max_cached_bytes_) { + // Lock + mutex.lock(); + + if (debug) + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, (long long)max_cached_bytes_); + + this->max_cached_bytes = max_cached_bytes_; + + // Unlock + mutex.unlock(); + + return gpuSuccess; + } + + /** + * \brief Implements the bin-finding algorithm described in the class + * documentation. Returns true if a bin was found, or false otherwise. + */ + bool FindBin(BlockDescriptor &search_key) { + size_t bytes = search_key.requested_bytes; + search_key.bytes = bytes; + + if (bytes > max_bin_bytes) { + // Size is greater than our preconfigured maximum: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. 
+ return false; + } + + // If a custom bin histogram is given, use that + auto it = bin_sizes.lower_bound(bytes); + if (it != bin_sizes.end()) { + search_key.bytes = *it; + return true; + } + + // Find geometric bin + unsigned int geobin; + NearestPowerOf(geobin, search_key.bytes, bin_growth, bytes); + // Minimum bin + if (geobin < min_bin) { + // Bin is less than minimum bin: round up + search_key.bytes = min_bin_bytes; + return true; + } + + // Test for linear binning; if so, find linear bin + if (linear_bin_index != INVALID_BIN && geobin >= linear_bin_index) { + search_key.bytes = NearestMultOf(bin_mult, bytes); + return true; + } + + // Otherwise, use geometric bin + if (geobin > max_bin) { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + return false; + } + + // search_key.bytes was set above by NearestPowerOf + return true; + } + + /** + * \brief Provides a suitable allocation of device memory for the given size + * on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. + */ + gpuError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + gpuStream_t active_stream = + 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + gpuError_t error = gpuSuccess; + + if (device == INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + search_key.requested_bytes = bytes; + bool binned = FindBin(search_key); + search_key.binned = binned; + + if (binned) { + // Search for a suitable cached allocation: lock + mutex.lock(); + + // Add bin size to created bin sizes + actual_bin_sizes.insert(search_key.bytes); + + // Iterate through the range of cached blocks on the same device in the + // same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) && + (block_itr->device == device) && + (block_itr->bytes == search_key.bytes)) { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + bool is_reusable = false; + if (active_stream == block_itr->associated_stream) { + is_reusable = true; + } else { + const gpuError_t event_status = gpuEventQuery(block_itr->ready_event); + if (event_status != gpuErrorNotReady) { + static_cast(gpuDebug(event_status)); + is_reusable = true; + } + } + + if (is_reusable) { + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + search_key.requested_bytes = bytes; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) + printf("\tDevice %d reused cached block at %p (%lld bytes) for " + "stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.unlock(); + } + + // Allocate the block if necessary + if (!found) { + // Set runtime's current device to specified device (entrypoint may not be + // set) + if (device != entrypoint_device) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + if (gpuDebug(error = gpuSetDevice(device))) + return error; + } + + // Attempt to allocate + if (gpuDebug(error = MallocInternal(&search_key.d_ptr, search_key.bytes, + active_stream)) == + gpuErrorMemoryAllocation) { + // The allocation attempt failed: free all cached blocks on device and + // retry + if (debug) + printf("\tDevice %d failed to allocate %lld bytes for stream %lld, " + "retrying after freeing cached allocations", + device, (long long)search_key.bytes, + (long long)search_key.associated_stream); + + error = gpuSuccess; // Reset the error we will return + static_cast(gpuGetLastError()); // Reset error + + // Lock + mutex.lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && + (block_itr->device == device)) { + // No need to worry about synchronization with the device: gpuFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
+ if (gpuDebug(error = FreeInternal(block_itr->d_ptr, + block_itr->associated_stream))) + break; + if (gpuDebug(error = gpuEventDestroy(block_itr->ready_event))) + break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) + printf("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks " + "cached (%lld bytes), %lld live blocks (%lld bytes) " + "outstanding.\n", + device, (long long)block_itr->bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + + block_itr = cached_blocks.erase(block_itr); + } + + // Unlock + mutex.unlock(); + + // Return under error + if (error) + return error; + + // Try to allocate again + if (gpuDebug(error = MallocInternal(&search_key.d_ptr, search_key.bytes, + active_stream))) + return error; + } + + search_key.associated_stream = active_stream; + + // Create ready event + if (gpuDebug(error = gpuEventCreateWithFlags(&search_key.ready_event, + gpuEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.unlock(); + + if (debug) + printf("\tDevice %d allocated new device block at %p (%lld bytes " + "associated with stream %lld).\n", + device, search_key.d_ptr, (long long)search_key.bytes, + (long long)search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && + (entrypoint_device != device)) { + if (gpuDebug(error = gpuSetDevice(entrypoint_device))) + return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) + printf( + "\t\t%lld available blocks cached (%lld bytes), %lld live blocks " + "outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), (long long)cached_bytes[device].free, + (long long)live_blocks.size(), (long long)cached_bytes[device].live); + + return error; + } + + /** + * \brief Provides a suitable allocation of device memory for the given size + * on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. + */ + gpuError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + gpuStream_t active_stream = + 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + /** + * \brief Frees a live allocation of device memory on the specified device, + * returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. 
+ */ + gpuError_t DeviceFree(int device, void *d_ptr) { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + gpuError_t error = gpuSuccess; + + if (device == INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the + // max cached threshold + if (search_key.binned && + (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) + printf("\tDevice %d returned %lld bytes from associated stream " + "%lld.\n\t\t %lld available blocks cached (%lld bytes), %lld " + "live blocks outstanding. (%lld bytes)\n", + device, (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + } + } + + // Unlock + mutex.unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + if (gpuDebug(error = gpuSetDevice(device))) + return error; + } + + if (recached) { + // Insert the ready event in the associated stream (must have current + // device set properly) + if (gpuDebug(error = gpuEventRecord(search_key.ready_event, + search_key.associated_stream))) + return error; + } + + if (!recached) { + // Free the allocation from the runtime and cleanup the event. + if (gpuDebug(error = FreeInternal(d_ptr, search_key.associated_stream))) + return error; + if (gpuDebug(error = gpuEventDestroy(search_key.ready_event))) + return error; + + if (debug) + printf("\tDevice %d freed %lld bytes from associated stream " + "%lld.\n\t\t %lld available blocks cached (%lld bytes), %lld " + "live blocks (%lld bytes) outstanding.\n", + device, (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && + (entrypoint_device != device)) { + if (gpuDebug(error = gpuSetDevice(entrypoint_device))) + return error; + } + + return error; + } + + /** + * \brief Frees a live allocation of device memory on the current device, + * returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. 
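For orientation, the two DeviceAllocate overloads and DeviceFree above form a deliberately CUB-like, stream-aware interface. The sketch below shows what that looks like from the caller's side; it is illustrative only and assumes a ROCm build, where the pool's gpu* aliases expand to the HIP runtime. The stream handling and byte counts are made up for the example.

#include <cstdio>
#include <hip/hip_runtime.h>
#include "hydrogen/PoolAllocator.hpp"

int main()
{
    hydrogen::PooledDeviceAllocator pool{/*bin_growth=*/2, /*min_bin=*/1, /*max_bin=*/20};

    hipStream_t stream;
    (void) hipStreamCreate(&stream);

    // First request: nothing is cached yet, so the pool asks the runtime.
    void* a = nullptr;
    pool.DeviceAllocate(&a, 1000, stream);   // rounded up to the 1024-byte bin
    pool.DeviceFree(a);                      // cached, not returned to the runtime

    // Same-sized request on the same stream: served from the cache immediately.
    void* b = nullptr;
    pool.DeviceAllocate(&b, 1000, stream);
    pool.DeviceFree(b);

    std::printf("cached: %zu bytes, live: %zu bytes\n",
                pool.FreeMemory(), pool.LiveMemory());

    (void) hipStreamSynchronize(stream);
    (void) hipStreamDestroy(stream);
    return 0;   // remaining cached blocks are released by the destructor
}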
+ */ + gpuError_t DeviceFree(void *d_ptr) { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + /** + * \brief Frees all cached device allocations on all devices + */ + gpuError_t FreeAllCached() { + gpuError_t error = gpuSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.lock(); + + while (!cached_blocks.empty()) { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) { + if (gpuDebug(error = gpuSetDevice(begin->device))) + break; + current_device = begin->device; + } + + // Free device memory + if (gpuDebug(error = + FreeInternal(begin->d_ptr, begin->associated_stream))) + break; + if (gpuDebug(error = gpuEventDestroy(begin->ready_event))) + break; + + // Reduce balance and erase entry + const size_t block_bytes = begin->bytes; + cached_bytes[current_device].free -= block_bytes; + cached_blocks.erase(begin); + + if (debug) + printf( + "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached " + "(%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long)block_bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes[current_device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[current_device].live); + } + + mutex.unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuSetDevice(entrypoint_device))) + return error; + } + + return error; + } + + /** + * \brief Destructor + */ + virtual ~PooledDeviceAllocator() { + if (!skip_cleanup) + static_cast(FreeAllCached()); + } + + /* Inspection and reporting methods */ + + size_t TotalAllocatedMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (auto const &[dev, totals] : cached_bytes) { + if (device != INVALID_DEVICE_ORDINAL && device != dev) + continue; + result += totals.live + totals.free; + } + return result; + } + + size_t FreeMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (auto const &[dev, totals] : cached_bytes) { + if (device != INVALID_DEVICE_ORDINAL && device != dev) + continue; + result += totals.free; + } + return result; + } + + size_t GetBinFreeMemory(int device = INVALID_DEVICE_ORDINAL, + size_t bin_size = INVALID_SIZE) const { + size_t result = 0; + for (BlockDescriptor const &desc : cached_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (desc.bytes == bin_size) + result += desc.bytes; + } + return result; + } + + size_t LiveMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (auto const &[dev, totals] : cached_bytes) { + if (device != INVALID_DEVICE_ORDINAL && device != dev) + continue; + result += totals.live; + } + return result; + } + + size_t GetBinLiveMemory(int device = INVALID_DEVICE_ORDINAL, + size_t bin_size = INVALID_SIZE) const { + size_t result = 0; + for (BlockDescriptor const &desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (desc.bytes == bin_size) + result += desc.bytes; + } + return result; + } + + size_t NonbinnedMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (BlockDescriptor const 
&desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (!desc.binned) + result += desc.bytes; + } + return result; + } + + size_t ExcessMemory(int device = INVALID_DEVICE_ORDINAL, + size_t bin_size = INVALID_SIZE) const { + size_t result = 0; + for (BlockDescriptor const &desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (bin_size != INVALID_SIZE && desc.bytes != bin_size) + continue; + result += desc.bytes - desc.requested_bytes; + } + return result; + } + + size_t GetNumBuffers(int device = INVALID_DEVICE_ORDINAL, bool cached = true, + bool live = true) const { + size_t result = 0; + if (cached) { + for (BlockDescriptor const &desc : cached_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + ++result; + } + } + if (live) { + for (BlockDescriptor const &desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + ++result; + } + } + return result; + } + + void Report(std::ostream &os, bool report_bins = true) const { + os << "Memory pool configuration:" << std::endl; + os << " Geometric bins - " << bin_growth << " ^ (" << min_bin << "-" + << ((max_bin == INVALID_BIN) ? "inf" : std::to_string(max_bin)) + << ")" << std::endl; + if (bin_mult_threshold == INVALID_BIN) { + os << " Linear bins - DISABLED" << std::endl; + } else { + os << " Linear bins - when geometric bin difference > " + << bin_mult_threshold << ", allocate in multiples of " << bin_mult + << std::endl; + } + + if (bin_sizes.size() == 0) { + os << " Custom bins - NONE" << std::endl; + } else { + os << " Custom bins - "; + bool first = true; + for (auto const &bin : bin_sizes) { + if (!first) + os << ", "; + HumanReadableSize(bin, os); + first = false; + } + os << std::endl; + } + os << " mallocAsync: " << (malloc_async ? "enabled" : "disabled") + << ", debug: " << (debug ? "enabled" : "disabled") + << ", skip cleanup: " << (skip_cleanup ? "yes" : "no") << std::endl; + + for (auto const &[dev, totals] : cached_bytes) { + os << "Memory pool allocation report (Device " << dev + << "):" << std::endl; + os << " Allocated memory: "; + HumanReadableSize(totals.live + totals.free, os); + os << " (Live: "; + HumanReadableSize(totals.live, os); + os << ", free: "; + HumanReadableSize(totals.free, os); + os << "). 
Buffers: " << GetNumBuffers() << std::endl; + os << " Total excess memory due to binning: "; + HumanReadableSize(ExcessMemory(dev), os); + os << std::endl; + + if (report_bins) { + os << " Detailed bin report:" << std::endl; + for (auto const &bin : actual_bin_sizes) { + os << " "; + HumanReadableSize(bin, os); + os << ": Live = "; + HumanReadableSize(GetBinLiveMemory(dev, bin), os); + os << ", Free = "; + HumanReadableSize(GetBinFreeMemory(dev, bin), os); + os << ", Excess = "; + HumanReadableSize(ExcessMemory(dev, bin), os); + os << std::endl; + } + os << " Non-binned: "; + HumanReadableSize(NonbinnedMemory(dev), os); + os << std::endl; + } + } + } + +private: + gpuError_t MallocInternal(void **ptr, size_t size, + gpuStream_t active_stream) { + if (malloc_async) { + return gpuMallocAsync(ptr, size, active_stream); + } else { + return gpuMalloc(ptr, size); + } + } + + gpuError_t FreeInternal(void *ptr, gpuStream_t active_stream) { + if (malloc_async) { + return gpuFreeAsync(ptr, active_stream); + } else { + return gpuFree(ptr); + } + } +}; + +} // namespace hydrogen + +#undef gpuMallocAsync +#undef gpuMalloc +#undef gpuFreeAsync +#undef gpuFree +#undef gpuSetDevice +#undef gpuGetDevice +#undef gpuEventQuery +#undef gpuGetLastError +#undef gpuEventDestroy +#undef gpuEventCreateWithFlags +#undef gpuEventDisableTiming +#undef gpuEventRecord +#undef gpuGetErrorString +#undef gpuStream_t +#undef gpuEvent_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuErrorMemoryAllocation +#undef GPU_ARCH + +#endif // HYDROGEN_POOLALLOCATOR_HPP_ diff --git a/include/hydrogen/SynchronizeAPI.hpp b/include/hydrogen/SynchronizeAPI.hpp index 339bcf355e..b9261018cd 100644 --- a/include/hydrogen/SynchronizeAPI.hpp +++ b/include/hydrogen/SynchronizeAPI.hpp @@ -9,33 +9,42 @@ namespace hydrogen // This synchronizes the additional SyncInfos to the "master". That // is, the execution streams described by the "others" will wait // for the "master" stream. -template -void AddSynchronizationPoint( - SyncInfo const& master, - SyncInfo const&... others) +template +void AddSynchronizationPoint(SyncInfo const &master, + SyncInfo const &other, + SyncInfo const &...others) { - AddSynchronizationPoint(master); +#ifdef HYDROGEN_HAVE_GPU + if constexpr (D == Device::GPU && D == D2) { + // When the streams are the same, there is no need to create + // synchronization points. Skip "other" call recursively with the rest. + if (master.Stream() == other.Stream()) + { + if constexpr (sizeof...(others) > 0UL) + AddSynchronizationPoint(master, others...); + return; + } + } +#endif // HYDROGEN_HAVE_GPU - int dummy[] = { (details::AddSyncPoint(master, others), 0)... }; - (void) dummy; + AddSynchronizationPoint(master); + int dummy[] = {(details::AddSyncPoint(master, other), 0), + (details::AddSyncPoint(master, others), 0)...}; + (void)dummy; } template -void AllWaitOnMaster( - SyncInfo const& master, SyncInfo const&... others) +void AllWaitOnMaster(SyncInfo const &master, SyncInfo const &...others) { AddSynchronizationPoint(master, others...); } template -void MasterWaitOnAll( - SyncInfo const& master, - SyncInfo const&... 
others) +void MasterWaitOnAll(SyncInfo const &master, SyncInfo const &...others) { - int dummy[] = { - (AddSynchronizationPoint(others, master), 0)...}; - (void) dummy; + int dummy[] = {(AddSynchronizationPoint(others, master), 0)...}; + (void)dummy; } -}// namespace hydrogen +} // namespace hydrogen #endif // HYDROGEN_SYNCHRONIZEAPI_HPP_ diff --git a/include/hydrogen/blas/GPU_BLAS_impl.hpp b/include/hydrogen/blas/GPU_BLAS_impl.hpp index e2f14acd28..3d48277654 100644 --- a/include/hydrogen/blas/GPU_BLAS_impl.hpp +++ b/include/hydrogen/blas/GPU_BLAS_impl.hpp @@ -48,6 +48,25 @@ namespace gpu_lapack_impl = hydrogen::cusolver; namespace gpu_blas_impl = hydrogen::rocblas; namespace gpu_lapack_impl = hydrogen::rocsolver; +#if defined(HYDROGEN_GPU_USE_FP16) +#if defined(HYDROGEN_HAVE_HALF) +template <> +struct hydrogen::Caster<__half, rocblas_half> +{ + static rocblas_half Cast(__half const& x) + { + return *(reinterpret_cast(&x)); + } +}; +#else +template <> +inline rocblas_half hydrogen::To(__half const& x) +{ + return *(reinterpret_cast(&x)); +} +#endif +#endif + #else #pragma GCC error "LOGIC ERROR: No GPU programming model enabled." #endif @@ -76,13 +95,14 @@ void AxpyImpl(SizeT size, T const& alpha, T* Y, SizeT incy, SyncInfo const& si) { - using NTP = MakePointer>; - using CNTP = MakePointerToConst>; + using NT = NativeType; + using NTP = MakePointer; + using CNTP = MakePointerToConst; SyncManager mgr(GetLibraryHandle(), si); gpu_blas_impl::Axpy( GetLibraryHandle(), - ToSizeT(size), alpha, + ToSizeT(size), hydrogen::To(alpha), reinterpret_cast(X), ToSizeT(incx), reinterpret_cast(Y), ToSizeT(incy)); } @@ -406,8 +426,9 @@ void GemmImpl( T* C, SizeT ldc, SyncInfo const& si) { - using NTP = MakePointer>; - using CNTP = MakePointerToConst>; + using NT = NativeType; + using NTP = MakePointer; + using CNTP = MakePointerToConst; SyncManager mgr(GetLibraryHandle(), si); gpu_blas_impl::Gemm( @@ -415,10 +436,10 @@ void GemmImpl( ToNativeTransposeMode(transA), ToNativeTransposeMode(transB), ToSizeT(m), ToSizeT(n), ToSizeT(k), - alpha, + To(alpha), reinterpret_cast(A), ToSizeT(lda), reinterpret_cast(B), ToSizeT(ldb), - beta, + To(beta), reinterpret_cast(C), ToSizeT(ldc)); } @@ -444,10 +465,10 @@ void GemmStridedBatchedImpl( ToNativeTransposeMode(transpA), ToNativeTransposeMode(transpB), ToSizeT(m), ToSizeT(n), ToSizeT(k), - &alpha, + reinterpret_cast(&alpha), reinterpret_cast(A), ToSizeT(lda), ToSizeT(strideA), reinterpret_cast(B), ToSizeT(ldb), ToSizeT(strideB), - &beta, + reinterpret_cast(&beta), reinterpret_cast(C), ToSizeT(ldc), ToSizeT(strideC), ToSizeT(batchCount)); } @@ -660,8 +681,9 @@ void GemvImpl( T* y, SizeT incy, SyncInfo const& si) { - using NTP = MakePointer>; - using CNTP = MakePointerToConst>; + using NT = NativeType; + using NTP = MakePointer; + using CNTP = MakePointerToConst; if (incy != SizeT(1)) throw std::runtime_error("incy must be 1 right now. 
" @@ -682,10 +704,10 @@ void GemvImpl( GetLibraryHandle(), ToNativeTransposeMode(ATrans), ToNativeTransposeMode(BTrans), ToSizeT(m), ToSizeT(n), ToSizeT(k), - alpha, + To(alpha), reinterpret_cast(A), ToSizeT(lda), reinterpret_cast(x), ToSizeT(LDB), - beta, + To(beta), reinterpret_cast(y), ToSizeT(LDC)); } diff --git a/include/hydrogen/device/gpu/CUB.hpp b/include/hydrogen/device/gpu/CUB.hpp index 75f10ad167..a328948ec5 100644 --- a/include/hydrogen/device/gpu/CUB.hpp +++ b/include/hydrogen/device/gpu/CUB.hpp @@ -2,23 +2,12 @@ #define HYDROGEN_IMPORTS_CUB_HPP_ #include "El/hydrogen_config.h" - -#ifdef HYDROGEN_HAVE_CUDA -#include -#include -#elif defined HYDROGEN_HAVE_ROCM -#include -#endif // HYDROGEN_HAVE_CUB +#include "hydrogen/PoolAllocator.hpp" namespace hydrogen { namespace cub { -#ifdef HYDROGEN_HAVE_CUDA -namespace cub_impl = ::cub; -#elif defined HYDROGEN_HAVE_ROCM -namespace cub_impl = ::hipcub; -#endif // HYDROGEN_HAVE_CUDA /** @brief Get singleton instance of CUB memory pool. * @@ -26,19 +15,30 @@ namespace cub_impl = ::hipcub; * already. The following environment variables are used to * control the construction of the memory pool: * - * - H_CUB_BIN_GROWTH: The growth factor. (Default: 2) - * - H_CUB_MIN_BIN: The minimum bin. (Default: 1) - * - H_CUB_MAX_BIN: The maximum bin. (Default: no max bin) - * - H_CUB_MAX_CACHED_SIZE: The maximum aggregate cached bytes + * - H_MEMPOOL_BIN_GROWTH: The growth factor. (Default: 2) + * - H_MEMPOOL_MIN_BIN: The minimum bin. (Default: 1) + * - H_MEMPOOL_MAX_BIN: The maximum bin. (Default: no max bin) + * - H_MEMPOOL_MAX_CACHED_SIZE: The maximum aggregate cached bytes * per device. (Default: No maximum) - * - H_CUB_DEBUG: If nonzero, allow CUB to print debugging output. + * - H_MEMPOOL_MAX_BIN_ALLOC_SIZE: The maximum cached bytes per + * allocation. (Default: No maximum) + * - H_MEMPOOL_BIN_MULT_THRESHOLD: The difference between two consecutive + * geometric bins from which linear binning is used. (Default: No + * linear binning) + * - H_MEMPOOL_BIN_MULT: The multiplier for linear bin sizes. (Default: + * No linear binning) + * - H_MEMPOOL_BIN_SIZES: Custom set of (comma-separated) bin sizes in + * bytes. (Default: None) + * - H_MEMPOOL_MALLOCASYNC: If nonzero, uses mallocAsync as the backend + * for non-binned allocation. (Default: 0) + * - H_MEMPOOL_DEBUG: If nonzero, prints debugging output. * * Note that if debugging output is turned on, there is no * synchronization across processes. Users should take care to * redirect output on a per-rank basis, either through the * features exposed by their MPI launcher or by some other means. */ - cub_impl::CachingDeviceAllocator& MemoryPool(); + PooledDeviceAllocator& MemoryPool(); /** Destroy singleton instance of CUB memory pool. */ void DestroyMemoryPool(); diff --git a/include/hydrogen/device/gpu/rocm/rocBLASError.hpp b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp index 7e1328ff51..5899b16416 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASError.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include // Helper error-checking macro. 
#define H_CHECK_ROCBLAS(cmd) \ diff --git a/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp index 6141694dab..27b3bb90f6 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp index e4838dd8ef..3316834b48 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include namespace hydrogen { @@ -41,14 +41,9 @@ struct NativeTypeT> { using type = rocblas_double_complex; // Half precision requires conversion as well #ifdef HYDROGEN_GPU_USE_FP16 -template <> struct NativeTypeT { using type = rocblas_half; }; +template <> struct NativeTypeT { using type = rocblas_half; }; #ifdef HYDROGEN_HAVE_HALF template <> struct NativeTypeT { using type = rocblas_half; }; -template <> -struct NativeTypeT> -{ - using type = rocblas_half_complex; -}; #endif // HYDROGEN_HAVE_HALF #endif // HYDROGEN_GPU_USE_FP16 @@ -118,10 +113,18 @@ struct IsSupportedType_Base template <> struct IsSupportedType_Base : std::true_type {}; template <> +struct IsSupportedType_Base : std::true_type {}; +template <> struct IsSupportedType_Base : std::true_type {}; template <> struct IsSupportedType_Base : std::true_type {}; +// support via rocblas_nrm2_ex +template <> +struct IsSupportedType_Base : std::true_type {}; +// support via rocblas_scal_ex +template <> +struct IsSupportedType_Base : std::true_type {}; #endif // HYDROGEN_GPU_USE_FP16 /** @class IsSupportedType diff --git a/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp index 319b287635..fb09d48e68 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp index 8dd788050a..ec17f0f631 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { @@ -51,9 +51,15 @@ ADD_AXPY_DECL(double); ADD_COPY_DECL(float); ADD_COPY_DECL(double); +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_DOT_DECL(rocblas_half); +#endif // HYDROGEN_GPU_USE_FP16 ADD_DOT_DECL(float); ADD_DOT_DECL(double); +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_NRM2_DECL(rocblas_half); +#endif // HYDROGEN_GPU_USE_FP16 ADD_NRM2_DECL(float); ADD_NRM2_DECL(double); diff --git a/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp index d4a37b29ff..4fdbec7e5a 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp @@ -4,7 +4,7 @@ #include #include "rocBLASManagement.hpp" -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp index cc5c648523..3c1b5cea24 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include "rocBLASMeta.hpp" diff --git 
a/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp index 7e088fdcf8..3f47f00103 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp index 8315d8b9f1..d0b8199216 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { namespace rocsolver { diff --git a/include/hydrogen/utils/HalfPrecision.hpp b/include/hydrogen/utils/HalfPrecision.hpp index 72c9c7918f..123b1287ea 100644 --- a/include/hydrogen/utils/HalfPrecision.hpp +++ b/include/hydrogen/utils/HalfPrecision.hpp @@ -46,7 +46,11 @@ struct is_arithmetic : true_type {}; #endif // Now include the actual Half library. +#if __has_include() // E.g., the one that ships with ROCm +#include +#else #include +#endif // Declare the hydrogen typedef namespace hydrogen @@ -94,19 +98,24 @@ inline hydrogen::cpu_half_type operator^(hydrogen::cpu_half_type const&, #endif // HYDROGEN_HAVE_HALF // Finally, do the GPU stuff -#ifdef HYDROGEN_GPU_USE_FP16 +#if defined HYDROGEN_HAVE_GPU && defined HYDROGEN_GPU_USE_FP16 // Grab the right header #if defined(HYDROGEN_HAVE_CUDA) #include -#elif defined(HYDROGEN_HAVE_AMDGPU) -#include +// Needed for CUDA_VERSION +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include #endif // HYDROGEN_HAVE_CUDA namespace hydrogen { -#if defined(HYDROGEN_HAVE_CUDA) +#if defined(__HIP_NO_HALF_CONVERSIONS__) +static_assert(false, "__HIP_NO_HALF_CONVERSIONS__ is defined."); +#endif + /** @brief Unified name for the FP16 type on GPU */ using gpu_half_type = __half; @@ -118,14 +127,9 @@ struct TypeTraits static std::string Name() { return typeid(gpu_half_type).name(); } };// struct TypeTraits -#elif defined(HYDROGEN_HAVE_AMDGPU) -/** @brief Unified name for the FP16 type on GPU */ -using gpu_half_type = rocblas_half; -#endif // HYDROGEN_HAVE_CUDA - }// namespace hydrogen -#if defined(HYDROGEN_HAVE_CUDA) && !defined(__CUDACC__) +#if defined(HYDROGEN_HAVE_ROCM) || (defined(HYDROGEN_HAVE_CUDA) && !(defined(__CUDACC__)) && (CUDA_VERSION < 12020)) /** @brief Enable "update" functionality for __half. 
*/ template @@ -198,12 +202,49 @@ inline hydrogen::gpu_half_type operator-( return -float(val); } -#endif // defined(HYDROGEN_HAVE_CUDA) && !defined(__CUDACC__) +#if defined(HYDROGEN_HAVE_ROCM) +inline bool operator<( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) < float(y); +} + +inline bool operator>( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) > float(y); +} + +inline bool operator<=( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) <= float(y); +} + +inline bool operator>=( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) >= float(y); +} + +inline bool operator==( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) == float(y); +} + +inline bool operator!=( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return !(x == y); +} +#endif // defined(HYDROGEN_HAVE_ROCM) +#endif // defined(HYDROGEN_HAVE_ROCM) || (defined(HYDROGEN_HAVE_CUDA) && !(defined(__CUDACC__)) && (CUDA_VERSION < 12020)) inline std::ostream& operator<<(std::ostream& os, hydrogen::gpu_half_type const& x) { return os << float(x) << "_h"; } -#endif // HYDROGEN_GPU_USE_FP16 +#endif // defined HYDROGEN_HAVE_GPU && defined HYDROGEN_GPU_USE_FP16 #endif // HYDROGEN_UTILS_HALFPRECISION_HPP_ diff --git a/include/hydrogen/utils/NumericTypeConversion.hpp b/include/hydrogen/utils/NumericTypeConversion.hpp index b30f4c3e32..aafddc0e8e 100644 --- a/include/hydrogen/utils/NumericTypeConversion.hpp +++ b/include/hydrogen/utils/NumericTypeConversion.hpp @@ -15,7 +15,11 @@ */ #ifdef HYDROGEN_HAVE_HALF +#if __has_include() // E.g., the one that ships with ROCm +#include +#else #include +#endif #endif // HYDROGEN_HAVE_HALF namespace hydrogen @@ -59,6 +63,17 @@ struct Caster return static_cast<__half>(x); } }; + +#ifdef HYDROGEN_HAVE_ROCM +template <> +struct Caster<__half, double> +{ + static double Cast(__half const& x) + { + return float(x); + } +}; +#endif // HYDROGEN_HAVE_ROCM #endif // HYDROGEN_GPU_USE_FP16 template diff --git a/src/core/MemoryPool.cpp b/src/core/MemoryPool.cpp index 1b5058f651..3e81fd70e4 100644 --- a/src/core/MemoryPool.cpp +++ b/src/core/MemoryPool.cpp @@ -4,6 +4,29 @@ namespace El { +bool details::debug_mempool() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_DEBUG"); + return (env && std::strlen(env) && env[0] != '0'); +} + +float details::default_mempool_bin_growth() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_BIN_GROWTH"); + return (env ? std::stof(env) : 1.6); +} + +size_t details::default_mempool_min_bin() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_MIN_BIN"); + return (env ? std::stoull(env) : 1UL); +} + +size_t details::default_mempool_max_bin() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_MAX_BIN"); + return (env ? 
std::stoull(env) : (1 << 26)); +} namespace { diff --git a/src/core/Profiling.cpp b/src/core/Profiling.cpp index 72d40349b1..27fa5323b5 100644 --- a/src/core/Profiling.cpp +++ b/src/core/Profiling.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "El/hydrogen_config.h" #include "El/core/Profiling.hpp" @@ -11,6 +12,10 @@ #include "cuda_runtime.h" #endif // HYDROGEN_HAVE_NVPROF +#ifdef HYDROGEN_HAVE_ROCTRACER +#include "roctx.h" +#endif + #ifdef HYDROGEN_HAVE_VTUNE #include #endif // HYDROGEN_HAVE_VTUNE @@ -56,6 +61,13 @@ bool nvprof_runtime_enabled = true; bool NVProfRuntimeEnabled() noexcept { return nvprof_runtime_enabled; } #endif +// Some variables for roctx +#ifdef HYDROGEN_HAVE_ROCTRACER +bool roctx_runtime_enabled = true; + +bool roctxRuntimeEnabled() noexcept { return roctx_runtime_enabled; } +#endif + }// namespace void EnableVTune() noexcept @@ -72,6 +84,13 @@ void EnableNVProf() noexcept #endif // HYDROGEN_HAVE_NVPROF } +void EnableROCTX() noexcept +{ +#ifdef HYDROGEN_HAVE_ROCTRACER + roctx_runtime_enabled = true; +#endif // HYDROGEN_HAVE_ROCTRACER +} + void DisableVTune() noexcept { #ifdef HYDROGEN_HAVE_VTUNE @@ -86,6 +105,13 @@ void DisableNVProf() noexcept #endif // HYDROGEN_HAVE_NVPROF } +void DisableROCTX() noexcept +{ +#ifdef HYDROGEN_HAVE_ROCTRACER + roctx_runtime_enabled = false; +#endif // HYDROGEN_HAVE_ROCTRACER +} + Color GetNextProfilingColor() noexcept { auto id = current_color.fetch_add(1, std::memory_order_relaxed); @@ -94,6 +120,11 @@ Color GetNextProfilingColor() noexcept void BeginRegionProfile(char const* s, Color c) noexcept { +#ifdef HYDROGEN_HAVE_ROCTRACER + if (roctxRuntimeEnabled()) + roctxRangePush(s); +#endif // HYDROGEN_HAVE_ROCTRACER + #ifdef HYDROGEN_HAVE_NVPROF if (NVProfRuntimeEnabled()) { @@ -130,6 +161,11 @@ void BeginRegionProfile(char const* s, Color c) noexcept void EndRegionProfile(const char *) noexcept { +#ifdef HYDROGEN_HAVE_ROCTRACER + if (roctxRuntimeEnabled()) + roctxRangePop(); +#endif // HYDROGEN_HAVE_ROCTRACER + #ifdef HYDROGEN_HAVE_NVPROF if (NVProfRuntimeEnabled()) nvtxRangePop(); diff --git a/src/core/environment.cpp b/src/core/environment.cpp index fc0db9f499..da06003e70 100644 --- a/src/core/environment.cpp +++ b/src/core/environment.cpp @@ -341,6 +341,11 @@ void Finalize() if( mpi::Finalized() ) cerr << "Warning: MPI was finalized before Elemental." << endl; + + // Reset the global comms + const_cast(El::mpi::COMM_SELF).Reset(); + const_cast(El::mpi::COMM_WORLD).Reset(); + if( ::numElemInits == 0 ) { delete ::args; diff --git a/src/core/imports/cub.cpp b/src/core/imports/cub.cpp index 61fb2ca6bc..60164ff932 100644 --- a/src/core/imports/cub.cpp +++ b/src/core/imports/cub.cpp @@ -1,75 +1,93 @@ #include "hydrogen/device/gpu/CUB.hpp" #include +#include +#include -namespace hydrogen -{ -namespace cub -{ -namespace -{ - -unsigned int get_env_uint(char const* env_var_name, - unsigned int default_value = 0U) noexcept -{ - char const* env = std::getenv(env_var_name); - return (env - ? static_cast(std::stoi(env)) - : default_value); +namespace hydrogen { +namespace cub { +namespace { + +unsigned int get_env_uint(char const *env_var_name, + unsigned int default_value = 0U) noexcept { + char const *env = std::getenv(env_var_name); + return (env ? 
static_cast(std::stoi(env)) : default_value); +} + +unsigned int get_bin_growth() noexcept { + return get_env_uint("H_MEMPOOL_BIN_GROWTH", 2U); +} + +unsigned int get_min_bin() noexcept { + return get_env_uint("H_MEMPOOL_MIN_BIN", 1U); +} + +unsigned int get_max_bin() noexcept { + return get_env_uint("H_MEMPOOL_MAX_BIN", PooledDeviceAllocator::INVALID_BIN); } -unsigned int get_bin_growth() noexcept -{ - return get_env_uint("H_CUB_BIN_GROWTH", 2U); +size_t get_max_cached_size() noexcept { + char const *env = std::getenv("H_MEMPOOL_MAX_CACHED_SIZE"); + return (env ? static_cast(std::stoul(env)) + : PooledDeviceAllocator::INVALID_SIZE); } -unsigned int get_min_bin() noexcept -{ - return get_env_uint("H_CUB_MIN_BIN", 1U); +size_t get_max_bin_alloc_size() noexcept { + char const *env = std::getenv("H_MEMPOOL_MAX_BIN_ALLOC_SIZE"); + return (env ? static_cast(std::stoul(env)) + : PooledDeviceAllocator::INVALID_SIZE); } -unsigned int get_max_bin() noexcept -{ - return get_env_uint("H_CUB_MAX_BIN", - cub_impl::CachingDeviceAllocator::INVALID_BIN); +unsigned int get_bin_mult_threshold() noexcept { + return get_env_uint("H_MEMPOOL_BIN_MULT_THRESHOLD", + PooledDeviceAllocator::INVALID_BIN); } -size_t get_max_cached_size() noexcept -{ - char const* env = std::getenv("H_CUB_MAX_CACHED_SIZE"); - return (env - ? static_cast(std::stoul(env)) - : cub_impl::CachingDeviceAllocator::INVALID_SIZE); +unsigned int get_bin_mult() noexcept { + return get_env_uint("H_MEMPOOL_BIN_MULT", PooledDeviceAllocator::INVALID_BIN); } -bool get_debug() noexcept -{ - char const* env = std::getenv("H_CUB_DEBUG"); - return (env - ? static_cast(std::stoi(env)) - : false); +bool get_debug() noexcept { + char const *env = std::getenv("H_MEMPOOL_DEBUG"); + return (env ? static_cast(std::stoi(env)) : false); +} + +bool get_malloc_async() noexcept { + char const *env = std::getenv("H_MEMPOOL_MALLOCASYNC"); + return (env ? static_cast(std::stoi(env)) : false); +} + +std::set get_bin_sizes() noexcept { + std::set result; + char const *env = std::getenv("H_MEMPOOL_BIN_SIZES"); + if (!env) + return result; + + std::string envstr{env}; + std::istringstream iss{envstr}; + std::string elem; + while(std::getline(iss, elem, ',')) { + result.insert(std::stoull(elem)); + } + return result; } /** Singleton instance of CUB memory pool. */ -std::unique_ptr memoryPool_; -} // namespace - -cub_impl::CachingDeviceAllocator& MemoryPool() -{ - if (!memoryPool_) - memoryPool_.reset( - new cub_impl::CachingDeviceAllocator( - get_bin_growth(), - get_min_bin(), - get_max_bin(), - get_max_cached_size(), - /*skip_cleanup=*/false, - get_debug())); - return *memoryPool_; +std::unique_ptr memoryPool_; +} // namespace + +PooledDeviceAllocator &MemoryPool() { + if (!memoryPool_) + memoryPool_.reset(new PooledDeviceAllocator( + get_bin_growth(), get_min_bin(), get_max_bin(), get_max_cached_size(), + /*skip_cleanup=*/false, get_debug(), get_bin_mult_threshold(), + get_bin_mult(), get_max_bin_alloc_size(), get_bin_sizes(), + get_malloc_async())); + + return *memoryPool_; } -void DestroyMemoryPool() -{ memoryPool_.reset(); } +void DestroyMemoryPool() { memoryPool_.reset(); } -} // namespace CUBMemoryPool +} // namespace cub } // namespace hydrogen diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 9d724f516e..a05385d758 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -595,15 +595,24 @@ EL_NO_RELEASE_EXCEPT { EL_DEBUG_CSE; + if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. 
+ using BE = BestBackend; + Al::Send(buf, count, to, comm.template GetComm(syncInfo)); + } + else + { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); + ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); + Synchronize(syncInfo); - EL_CHECK_MPI_CALL( - MPI_Send( - buf, count, TypeMap(), to, tag, comm.GetMPIComm())); + EL_CHECK_MPI_CALL( + MPI_Send( + buf, count, TypeMap(), to, tag, comm.GetMPIComm())); + } } template ; + if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Send(buf, count, to, comm.template GetComm(syncInfo)); + } + else if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Send((Real*)buf, 2*count, to, + comm.template GetComm(syncInfo)); + } + else { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); + ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); + Synchronize(syncInfo); #ifdef EL_AVOID_COMPLEX_MPI - EL_CHECK_MPI_CALL( - MPI_Send( - buf, 2*count, TypeMap(), to, tag, comm.GetMPIComm())); + EL_CHECK_MPI_CALL( + MPI_Send( + buf, 2*count, TypeMap(), to, tag, comm.GetMPIComm())); #else - EL_CHECK_MPI_CALL( - MPI_Send( - buf, count, TypeMap>(), to, tag, comm.GetMPIComm())); + EL_CHECK_MPI_CALL( + MPI_Send( + buf, count, TypeMap>(), to, tag, comm.GetMPIComm())); #endif + } } template ::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Recv(buf, count, from, comm.template GetComm(syncInfo)); + } + else + { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); + ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); - Status status; - EL_CHECK_MPI_CALL( - MPI_Recv( - buf, count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); + Synchronize(syncInfo); + Status status; + EL_CHECK_MPI_CALL( + MPI_Recv( + buf, count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); + } } template * buf, int count, int from, int tag, Comm const& c EL_NO_RELEASE_EXCEPT { EL_DEBUG_CSE; + using CT = std::complex; + if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Recv(buf, count, from, comm.template GetComm(syncInfo)); + } + else if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. 
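The complex-valued overloads fall back to shipping the data as 2*count reals when only the real-typed Aluminum backend is usable, as in the Send above and the Recv that follows. That reinterpretation is safe because std::complex<T> is guaranteed to be layout-compatible with T[2]; a standalone check, illustrative only:

#include <cassert>
#include <complex>
#include <vector>

int main()
{
    // A buffer of n std::complex<double> values may be viewed as 2*n
    // doubles (real, imag pairs), which is what the (Real*) casts with
    // doubled counts rely on.
    std::vector<std::complex<double>> z{{1.0, 2.0}, {3.0, 4.0}};
    auto const* flat = reinterpret_cast<double const*>(z.data());
    assert(flat[0] == 1.0 && flat[1] == 2.0);
    assert(flat[2] == 3.0 && flat[3] == 4.0);
    return 0;
}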
+ using BE = BestBackend; + Al::Recv((Real*) buf, 2*count, from, + comm.template GetComm(syncInfo)); + } + else + { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); + ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); + Synchronize(syncInfo); - Status status; + Status status; #ifdef EL_AVOID_COMPLEX_MPI - EL_CHECK_MPI_CALL( - MPI_Recv( - buf, 2*count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); + EL_CHECK_MPI_CALL( + MPI_Recv( + buf, 2*count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); #else - EL_CHECK_MPI_CALL( - MPI_Recv( - buf, count, TypeMap>(), - from, tag, comm.GetMPIComm(), &status)); + EL_CHECK_MPI_CALL( + MPI_Recv( + buf, count, TypeMap>(), + from, tag, comm.GetMPIComm(), &status)); #endif + } } template (syncInfo)); } +namespace { + +template struct BackendTag {}; + +template +void do_copy_n(T const* src, int count, T* dst, SyncInfo const&) +{ + std::copy_n(src, count, dst); +} + +#ifdef HYDROGEN_HAVE_GPU +template +void do_copy_n(T const* src, int count, T* dst, SyncInfo const& si) +{ + gpu::Copy1DIntraDevice(src, dst, count, si); +} +#endif // HYDROGEN_HAVE_GPU + +template +void SafeInPlaceSendRecv(T* buf, int count, int to, int from, Comm const& comm, + SyncInfo const& syncInfo, BackendTag) +{ + hydrogen::simple_buffer tmp_recv_buf(count, syncInfo); + Al::SendRecv( + buf, count, to, tmp_recv_buf.data(), count, from, + comm.template GetComm(syncInfo)); + do_copy_n(tmp_recv_buf.data(), count, buf, syncInfo); +} + +#ifdef HYDROGEN_HAVE_AL_HOST_XFER +template +void SafeInPlaceSendRecv(T* buf, int count, int to, int from, Comm const& comm, + SyncInfo const& syncInfo, + BackendTag) +{ + // This is ok due to implementation details of how the + // HostTransfer backend works. 
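The generic SafeInPlaceSendRecv above emulates an in-place exchange for backends that require distinct send and receive buffers: it receives into a temporary simple_buffer and then copies the result back over the caller's pointer. The same idea, distilled to plain host-side MPI with a hypothetical helper name, purely as an analogy rather than the code path Hydrogen uses:

#include <algorithm>
#include <vector>
#include <mpi.h>

// Emulate an in-place exchange when only an out-of-place send/recv is
// available: receive into a temporary and copy back afterwards.
void InPlaceSendRecvFallback(double* buf, int count, int to, int from,
                             MPI_Comm comm)
{
    std::vector<double> tmp(count);
    MPI_Sendrecv(buf, count, MPI_DOUBLE, to, /*sendtag=*/0,
                 tmp.data(), count, MPI_DOUBLE, from, /*recvtag=*/0,
                 comm, MPI_STATUS_IGNORE);
    std::copy_n(tmp.begin(), count, buf);
}

Per the in-code comment, the HostTransfer overload can skip this staging and pass the same pointer for both sides.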
+ using Backend = Al::HostTransferBackend; + Al::SendRecv( + buf, count, to, buf, count, from, + comm.template GetComm(syncInfo)); +} +#endif +} // namespace + template >*/> void SendRecv(T* buf, int count, int to, int from, Comm const& comm, SyncInfo const& syncInfo) { EL_DEBUG_CSE; - using Backend = BestBackend; - // Not sure if Al is ok with this bit + +#ifdef HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV Al::SendRecv( - buf, count, to, buf, count, from, + buf, count, to, from, comm.template GetComm(syncInfo)); +#else + SafeInPlaceSendRecv(buf, count, to, from, comm, syncInfo, + BackendTag{}); +#endif } #endif // HYDROGEN_HAVE_ALUMINUM diff --git a/src/hydrogen/device/GPU.cpp b/src/hydrogen/device/GPU.cpp index 300e39eada..93b0ac25ae 100644 --- a/src/hydrogen/device/GPU.cpp +++ b/src/hydrogen/device/GPU.cpp @@ -40,6 +40,7 @@ int ComputeDeviceId(unsigned int device_count) noexcept // TODO: Use HWLOC or something to pick "closest GPU" int local_rank = 0; char* env = nullptr; + if (!env) { env = std::getenv("FLUX_TASK_LOCAL_ID"); } if (!env) { env = std::getenv("SLURM_LOCALID"); } if (!env) { env = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); } if (!env) { env = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); } diff --git a/src/hydrogen/device/ROCm.cpp b/src/hydrogen/device/ROCm.cpp index db1592184c..19f37d270a 100644 --- a/src/hydrogen/device/ROCm.cpp +++ b/src/hydrogen/device/ROCm.cpp @@ -75,7 +75,13 @@ hipStream_t GetNewStream() hipEvent_t GetNewEvent() { hipEvent_t event; +#if HIP_VERSION < 50600000 H_CHECK_HIP(hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#else + H_CHECK_HIP(hipEventCreateWithFlags( + &event, + hipEventDisableTiming | hipEventDisableSystemFence)); +#endif return event; } diff --git a/src/hydrogen/device/cuBLAS_API.cpp b/src/hydrogen/device/cuBLAS_API.cpp index fd6714bad5..59377159f8 100644 --- a/src/hydrogen/device/cuBLAS_API.cpp +++ b/src/hydrogen/device/cuBLAS_API.cpp @@ -167,7 +167,6 @@ using RealType = typename RealTypeT::type; #define ADD_NRM2_IMPL(ScalarType, TypeChar) \ void Nrm2(cublasHandle_t handle, \ int n, ScalarType const* X, int incx, \ - ScalarType const* Y, int incy, \ RealType* output) \ { \ H_CHECK_CUBLAS( \ diff --git a/src/hydrogen/device/rocBLAS_API.cpp b/src/hydrogen/device/rocBLAS_API.cpp index 2c8fa4f783..75c5ddcdb6 100644 --- a/src/hydrogen/device/rocBLAS_API.cpp +++ b/src/hydrogen/device/rocBLAS_API.cpp @@ -1,14 +1,112 @@ #include +#include #include #include -#include +#include namespace hydrogen { namespace rocblas { +namespace +{ +/** @brief Manage host-pointer result semantics. + * + * If the memory mode on the handle is already set to DEVICE, this is + * just a passthrough. Otherwise, we grab memory from CUB (fallback + * to the HIP stream-aware allocator) and use that. 
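From the caller's perspective the staging described above is invisible: a handle left in the default host pointer mode still receives its scalar result synchronously, matching the cuBLAS convention. A sketch of such a call site follows, with error checking elided; the header paths are assumed from this repository's layout and the expected value is just arithmetic for the chosen input.

#include <cstdio>
#include <vector>
#include <hip/hip_runtime.h>
#include <rocblas/rocblas.h>
#include <hydrogen/device/gpu/rocm/rocBLAS_API.hpp>

int main()
{
    std::vector<float> h_x(1024, 2.0f);
    float* d_x = nullptr;
    hipMalloc(reinterpret_cast<void**>(&d_x), h_x.size() * sizeof(float));
    hipMemcpy(d_x, h_x.data(), h_x.size() * sizeof(float), hipMemcpyHostToDevice);

    rocblas_handle handle;
    rocblas_create_handle(&handle);   // pointer mode defaults to host

    // The wrapper stages the result through a device scalar, copies it back,
    // and synchronizes the stream, so nrm is valid as soon as this returns.
    float nrm = 0.f;
    hydrogen::rocblas::Nrm2(handle, static_cast<rocblas_int>(h_x.size()),
                            d_x, 1, &nrm);
    std::printf("||x|| = %f\n", nrm);   // expect 64 for 1024 copies of 2

    rocblas_destroy_handle(handle);
    hipFree(d_x);
    return 0;
}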
+ */ +template +class ResultMgr +{ +public: + ResultMgr(rocblas_handle handle, T* result_ptr) + : handle_{handle}, + result_{result_ptr}, + device_{nullptr} + { + rocblas_pointer_mode current_ptr_mode; + H_CHECK_ROCBLAS(rocblas_get_pointer_mode(handle_, ¤t_ptr_mode)); + if (current_ptr_mode == rocblas_pointer_mode_host) + { + hipStream_t stream; + H_CHECK_ROCBLAS(rocblas_get_stream(handle_, &stream)); +#ifdef HYDROGEN_HAVE_CUB + H_CHECK_HIP(cub::MemoryPool().DeviceAllocate( + reinterpret_cast(&device_), + sizeof(T), + stream)); +#else + H_CHECK_HIP(hipMallocAsync( + reinterpret_cast(&device_), + sizeof(T), + stream)); +#endif // HYDROGEN_HAVE_CUB + + // Now set the pointer mode + H_CHECK_ROCBLAS( + rocblas_set_pointer_mode(handle_, + rocblas_pointer_mode_device)); + } + } + + ~ResultMgr() + { + try + { + if (device_) + { + hipStream_t stream; + H_CHECK_ROCBLAS(rocblas_get_stream(handle_, &stream)); + + // Copy asyncly to host + H_CHECK_HIP(hipMemcpyAsync(result_, + device_, + sizeof(T), + hipMemcpyDeviceToHost, + stream)); + + // Clean up device memory +#ifdef HYDROGEN_HAVE_CUB + H_CHECK_HIP(cub::MemoryPool().DeviceFree(device_)); +#else + H_CHECK_HIP(hipFreeAsync(device_, stream)); +#endif // HYDROGEN_HAVE_CUB + + // Sync stream to match cuBLAS behavior (cuBLAS docs here: + // https://docs.nvidia.com/cuda/cublas/#scalar-parameters) + H_CHECK_HIP(hipStreamSynchronize(stream)); + + // Reset pointer mode + H_CHECK_ROCBLAS(rocblas_set_pointer_mode(handle_, rocblas_pointer_mode_host)); + } + } + catch (std::exception const& e) + { + std::cerr << "Caught exception in dtor:\n\n " << e.what() << "\n\nTerminating." + << std::endl; + std::terminate(); + } + } + + T* get() noexcept { return (device_ ? device_ : result_); } +private: + rocblas_handle handle_ = nullptr; + T* result_ = nullptr; + T* device_ = nullptr; +}; // struct ResultMgr + +template +auto manage_result(rocblas_handle handle, T* result_ptr) +{ + if (!result_ptr) + throw GPUError("result_ptr cannot be null"); + return ResultMgr{handle, result_ptr}; +}; + +} // // BLAS 1 @@ -42,23 +140,25 @@ namespace rocblas rocblas_int n, \ ScalarType const* X, rocblas_int incx, \ ScalarType const* Y, rocblas_int incy, \ - ScalarType* result) \ + ScalarType* result_in) \ { \ + auto result = manage_result(handle, result_in); \ H_CHECK_ROCBLAS( \ rocblas_ ## TypeChar ## dot( \ handle, \ - n, X, incx, Y, incy, result)); \ + n, X, incx, Y, incy, result.get())); \ } #define ADD_NRM2_IMPL(ScalarType, TypeChar) \ void Nrm2(rocblas_handle handle, \ rocblas_int n, ScalarType const* X, rocblas_int incx, \ - ScalarType* result) \ + ScalarType* result_in) \ { \ + auto result = manage_result(handle, result_in); \ H_CHECK_ROCBLAS( \ rocblas_ ## TypeChar ## nrm2( \ handle, \ - n, X, incx, result)); \ + n, X, incx, result.get())); \ } #define ADD_SCALE_IMPL(ScalarType, TypeChar) \ @@ -243,14 +343,49 @@ ADD_AXPY_IMPL(double, d) ADD_COPY_IMPL(float, s) ADD_COPY_IMPL(double, d) -//ADD_DOT_IMPL(rocblas_half, h) +ADD_DOT_IMPL(rocblas_half, h) ADD_DOT_IMPL(float, s) ADD_DOT_IMPL(double, d) -//ADD_NRM2_IMPL(rocblas_half, h) +// ADD_NRM2_IMPL(rocblas_half, h) +void Nrm2(rocblas_handle handle, + rocblas_int n, + rocblas_half const* X, + rocblas_int incx, + rocblas_half* result_in) +{ + auto result = manage_result(handle, result_in); + H_CHECK_ROCBLAS( + rocblas_nrm2_ex( + handle, + n, + X, + rocblas_datatype_f16_r, + incx, + result.get(), + rocblas_datatype_f16_r, + rocblas_datatype_f32_r)); +} + ADD_NRM2_IMPL(float, s) ADD_NRM2_IMPL(double, d) +void Scale(rocblas_handle 
handle, + rocblas_int n, rocblas_half const& alpha, + rocblas_half* X, rocblas_int incx) +{ + H_CHECK_ROCBLAS( + rocblas_scal_ex( + handle, + n, + &alpha, + rocblas_datatype_f16_r, + X, + rocblas_datatype_f16_r, + incx, + rocblas_datatype_f32_r)); +} + ADD_SCALE_IMPL(float, s) ADD_SCALE_IMPL(double, d) @@ -320,21 +455,21 @@ ASSERT_SUPPORT(double, BLAS_Op::NRM2); ASSERT_SUPPORT(double, BLAS_Op::GEMMSTRIDEDBATCHED); #ifdef HYDROGEN_GPU_USE_FP16 -ASSERT_SUPPORT(rocblas_half, BLAS_Op::AXPY); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMM); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::SCAL); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::COPY); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::DGMM); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEAM); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEMV); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::DOT); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::NRM2); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMMSTRIDEDBATCHED); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::AXPY); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::GEMM); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::GEMV); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::DOT); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::NRM2); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::GEMMSTRIDEDBATCHED); #ifdef HYDROGEN_HAVE_HALF ASSERT_SUPPORT(cpu_half_type, BLAS_Op::AXPY); ASSERT_SUPPORT(cpu_half_type, BLAS_Op::GEMM); -ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::SCAL); +ASSERT_SUPPORT(cpu_half_type, BLAS_Op::SCAL); ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::COPY); ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::DGMM); ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEAM); diff --git a/src/hydrogen/device/rocSOLVER_API.cpp b/src/hydrogen/device/rocSOLVER_API.cpp index ad57073294..229edadf8c 100644 --- a/src/hydrogen/device/rocSOLVER_API.cpp +++ b/src/hydrogen/device/rocSOLVER_API.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace hydrogen { namespace rocsolver { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 43dff24736..e9cc78c2ee 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,13 +11,20 @@ foreach (src_file ${SOURCES}) add_executable("${__test_name}" ${src_file}) target_link_libraries("${__test_name}" PRIVATE ${HYDROGEN_LIBRARIES}) + # FIXME: https://github.com/llnl/elemental/issues/159 + if (__test_name MATCHES "DifferentGridsGeneral") + continue() + endif () + # Create test add_test(NAME "${__test_name}.test" COMMAND "${__test_name}") - add_test(NAME "${__test_name}_mpi_np4.test" - COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPI_PREFLAGS} - $ ${MPI_POSTFLAGS}) + if (MPIEXEC) + add_test(NAME "${__test_name}_mpi_np4.test" + COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPI_PREFLAGS} + $ ${MPI_POSTFLAGS}) + endif () endforeach () set_target_properties(HermitianEig diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index 14dc5d4ca6..f1fc0a1bb6 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -7,6 +7,7 @@ if (HYDROGEN_HAVE_GPU) gpu_combine_test.cu gpu_entrywise_map_test.cu copy_cpu_gpu_test.cpp + pool_allocator_test.cpp ) if (HYDROGEN_HAVE_HALF AND HYDROGEN_GPU_USE_FP16) list(APPEND HYDROGEN_CATCH2_TEST_FILES diff --git a/unit_test/pool_allocator_test.cpp b/unit_test/pool_allocator_test.cpp new file mode 100644 index 0000000000..eb3c3521b4 --- /dev/null +++ b/unit_test/pool_allocator_test.cpp @@ -0,0 +1,75 @@ 
+#include + +#include "El.hpp" +#include "hydrogen/PoolAllocator.hpp" + +TEST_CASE("Testing hydrogen::PooledDeviceAllocator", "[memory][utils][gpu]") { + SECTION("Basic pool behavior and geometric binning") { + hydrogen::PooledDeviceAllocator alloc{/*bin_growth=*/2, + /*min_bin=*/1, /*max_bin=*/20}; + void *ptr; + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 201); + REQUIRE(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + CHECK(alloc.ExcessMemory() == 55 * sizeof(float)); + alloc.DeviceFree(ptr); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 203); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + alloc.DeviceFree(ptr); + + // Unbinned memory + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 1048576); + CHECK(alloc.TotalAllocatedMemory() == (256 + 1048576) * sizeof(float)); + alloc.DeviceFree(ptr); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + } + SECTION("Linear binning test") { + hydrogen::PooledDeviceAllocator alloc{ + /*bin_growth=*/2, + /*min_bin=*/1, + /*max_bin=*/20, + /*max_cached_size=*/hydrogen::PooledDeviceAllocator::INVALID_SIZE, + /*skip_cleanup=*/false, + /*debug=*/false, + /*bin_mult_threshold=*/1024 * sizeof(float), + /*bin_mult=*/6, + /*max_binned_alloc_size=*/4007 * sizeof(float)}; + + void *ptr; + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 201); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + alloc.DeviceFree(ptr); + + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 3001); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float) + 12006); + alloc.DeviceFree(ptr); + + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 4009); + CHECK(alloc.TotalAllocatedMemory() == + ((256 + 4009) * sizeof(float) + 12006)); + alloc.DeviceFree(ptr); + CHECK(alloc.TotalAllocatedMemory() == (256 * sizeof(float) + 12006)); + } + SECTION("Custom binning test") { + hydrogen::PooledDeviceAllocator alloc{ + /*bin_growth=*/2, + /*min_bin=*/1, + /*max_bin=*/20, + /*max_cached_size=*/hydrogen::PooledDeviceAllocator::INVALID_SIZE, + /*skip_cleanup=*/false, + /*debug=*/false, + /*bin_mult_threshold=*/hydrogen::PooledDeviceAllocator::INVALID_BIN, + /*bin_mult=*/hydrogen::PooledDeviceAllocator::INVALID_BIN, + /*max_cached_size=*/hydrogen::PooledDeviceAllocator::INVALID_SIZE, + /*bin_sizes=*/{1, 21, 39, 100}}; + + void *ptr; + alloc.DeviceAllocate(-1, &ptr, 32); + CHECK(alloc.TotalAllocatedMemory() == 39); + alloc.DeviceFree(ptr); + + alloc.DeviceAllocate(-1, &ptr, 201); + CHECK(alloc.TotalAllocatedMemory() == (39 + 256)); + alloc.DeviceFree(ptr); + } +}
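A note on the arithmetic in the first test section: with bin_growth = 2 and min_bin = 1, a request is rounded up to the next power of two, so 201 floats (804 bytes) is served from the 1024-byte bin (256 floats) and carries 55 floats of slack, which is exactly what the TotalAllocatedMemory() and ExcessMemory() checks assert. Below is a small standalone sketch of that rounding rule, assuming pure geometric binning with no linear or custom bins; the helper name is hypothetical, not the allocator's internals.

#include <cassert>
#include <cstddef>

// Smallest bin_growth^k (k >= min_bin) that can hold the request.
std::size_t NearestGeometricBin(std::size_t bytes, unsigned bin_growth = 2,
                                unsigned min_bin = 1)
{
    std::size_t bin = 1;
    for (unsigned k = 0; k < min_bin; ++k)
        bin *= bin_growth;
    while (bin < bytes)
        bin *= bin_growth;
    return bin;
}

int main()
{
    assert(NearestGeometricBin(201 * sizeof(float)) == 256 * sizeof(float));
    assert(NearestGeometricBin(203 * sizeof(float)) == 256 * sizeof(float));
    // 256 - 201 = 55 floats of excess, matching the ExcessMemory() check above.
    assert(256 * sizeof(float) - 201 * sizeof(float) == 55 * sizeof(float));
    return 0;
}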