diff --git a/CMakeLists.txt b/CMakeLists.txt index 219abe707f..ecbe09895e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ endif (__GIT_EXECUTABLE) # This must be set because version tags set(HYDROGEN_VERSION_MAJOR 1) set(HYDROGEN_VERSION_MINOR 5) -set(HYDROGEN_VERSION_PATCH 2) +set(HYDROGEN_VERSION_PATCH 4) set(HYDROGEN_VERSION_MAJOR_MINOR "${HYDROGEN_VERSION_MAJOR}.${HYDROGEN_VERSION_MINOR}") set(HYDROGEN_VERSION @@ -101,10 +101,7 @@ if (NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) endif () # Feature-related options - -option(Hydrogen_ENABLE_ALUMINUM - "Enable the Aluminum package for improved device-side communication." - OFF) +include(CMakeDependentOption) option(Hydrogen_ENABLE_CUDA "Search for CUDA support and enable related features if found." @@ -114,6 +111,12 @@ option(Hydrogen_ENABLE_ROCM "Search for ROCm/HIP support and enable related features if found." OFF) +cmake_dependent_option(Hydrogen_ENABLE_ROCTRACER + "Search for Roctracer and enable related features if found." + OFF + "Hydrogen_ENABLE_ROCM" + OFF) + if (Hydrogen_ENABLE_ROCM AND Hydrogen_ENABLE_CUDA) message(FATAL_ERROR "ROCm and CUDA code paths are mutually exclusive. " @@ -313,10 +316,6 @@ if (Hydrogen_ENABLE_CUDA) set(HYDROGEN_GPU_USE_TENSOR_OP_MATH TRUE) endif () - if (Hydrogen_ENABLE_GPU_FP16) - set(HYDROGEN_GPU_USE_FP16 TRUE) - endif () - if (Hydrogen_ENABLE_CUB) if (CUDAToolkit_VERSION_MAJOR LESS 11) find_package(CUB MODULE REQUIRED) @@ -358,74 +357,102 @@ if (Hydrogen_ENABLE_ROCM) find_package(rocsolver CONFIG REQUIRED) find_package(rocthrust CONFIG REQUIRED) + if (Hydrogen_ENABLE_ROCTRACER) + find_package(Roctracer MODULE COMPONENTS roctx) + set(HYDROGEN_HAVE_ROCTRACER ${Roctracer_FOUND}) + endif () + + include(HydrogenCleanupHIPTargets) + h_clean_hip_targets() + set(HYDROGEN_HAVE_ROCM TRUE) message(STATUS "Found ROCm/HIP toolchain. 
Using HIP/ROCm.") set(H_ROCM_CXX_LIBS hip::host - hip::hipcub roc::rocblas roc::rocsolver - roc::rocthrust) - + roc::rocthrust + ${Roctracer_LIBRARIES}) + if (HYDROGEN_HAVE_CUB) + list(APPEND H_ROCM_CXX_LIBS hip::hipcub) + endif () set(H_ROCM_HIP_LIBS hip::device) endif (Hydrogen_ENABLE_ROCM) if (HYDROGEN_HAVE_CUDA OR HYDROGEN_HAVE_ROCM) set(HYDROGEN_HAVE_GPU TRUE) + + if (Hydrogen_ENABLE_GPU_FP16) + set(HYDROGEN_GPU_USE_FP16 TRUE) + endif () endif () -if (Hydrogen_ENABLE_ALUMINUM) - find_package(Aluminum 1.0.0 CONFIG QUIET) +find_package(Aluminum 1.0.0 CONFIG QUIET) +if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE) + message(STATUS + "A candidate Aluminum > v1.0.0 was found, but was not selected:") + message(STATUS + " ${Aluminum_NOT_FOUND_MESSAGE}") +endif () +# Try again, since we're technically ok with >v0.7.0 +if (NOT Aluminum_FOUND) + find_package(Aluminum 0.7.0 CONFIG QUIET) if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE) message(STATUS - "A candidate Aluminum > v1.0.0 was found, but was not selected:") + "A candidate Aluminum > v0.7.0 was found, but was not selected:") message(STATUS " ${Aluminum_NOT_FOUND_MESSAGE}") endif () - # Try again, since we're technically ok with >v0.7.0 - if (NOT Aluminum_FOUND) - find_package(Aluminum 0.7.0 CONFIG QUIET) - if (NOT Aluminum_FOUND AND Aluminum_NOT_FOUND_MESSAGE) - message(STATUS - "A candidate Aluminum > v0.7.0 was found, but was not selected:") - message(STATUS - " ${Aluminum_NOT_FOUND_MESSAGE}") - endif () - endif () +endif () - if (Aluminum_FOUND) - set(HYDROGEN_HAVE_ALUMINUM TRUE) - message(STATUS - "Found Aluminum@${ALUMINUM_VERSION}: ${Aluminum_DIR}") +if (Aluminum_FOUND) + set(HYDROGEN_HAVE_ALUMINUM TRUE) + message(STATUS + "Found Aluminum@${ALUMINUM_VERSION}: ${Aluminum_DIR}") - if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) - set(HYDROGEN_HAVE_NCCL2 TRUE) - message(STATUS "Aluminum detected with NCCL2 backend support.") - else () - set(HYDROGEN_HAVE_NCCL2 FALSE) - endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) + if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) + set(HYDROGEN_HAVE_NCCL2 TRUE) + message(STATUS "Aluminum detected with NCCL2 backend support.") + else () + set(HYDROGEN_HAVE_NCCL2 FALSE) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) - if (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) - set(HYDROGEN_HAVE_AL_HOST_XFER TRUE) - message(STATUS "Aluminum detected with HostTransfer backend support.") - else () - set(HYDROGEN_HAVE_AL_HOST_XFER FALSE) - endif (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) + if (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) + set(HYDROGEN_HAVE_AL_HOST_XFER TRUE) + message(STATUS "Aluminum detected with HostTransfer backend support.") + else () + set(HYDROGEN_HAVE_AL_HOST_XFER FALSE) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_HOST_TRANSFER) - if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) - set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE) - message(STATUS "Aluminum detected with MPI-CUDA backend support.") - else () - set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE) - endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) + if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) + set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE) + message(STATUS "Aluminum detected with MPI-CUDA backend support.") + else () + set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) + + # Check for in-place SendRecv. + if (ALUMINUM_VERSION VERSION_GREATER_EQUAL "1.3.0") + set(HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV TRUE) else () - message(FATAL_ERROR "Aluminum support requested but not found. 
" - "Please set Aluminum_DIR to point to the installation prefix " - "for Aluminum.") - endif (Aluminum_FOUND) -endif (Hydrogen_ENABLE_ALUMINUM) + set(HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV FALSE) + endif () + + if (HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV) + message(STATUS "Aluminum detected with in-place SendRecv support.") + else () + message(STATUS "Aluminum detected WITHOUT in-place SendRecv support.") + endif () + +else () + + message(FATAL_ERROR "Aluminum support required but not found. " + "Please set Aluminum_ROOT to its installation prefix or add " + "the installation prefix to CMAKE_PREFIX_PATH.") + +endif (Aluminum_FOUND) # Sets up EL_RESTRICT and EL_HAVE_PRETTY_FUNCTION include(detect/CXX) @@ -484,7 +511,7 @@ add_subdirectory(src) # docs (which has the advantage that preprocessing will take # "{,hydrogen_}config.h" into consideration). configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/config.h.in" - "${PROJECT_BINARY_DIR}/include/El/config.h") + "${PROJECT_BINARY_DIR}/include/El/config.h" ESCAPE_QUOTES) configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/hydrogen_config.h.in" "${PROJECT_BINARY_DIR}/include/El/hydrogen_config.h") configure_file("${PROJECT_SOURCE_DIR}/doxy/Doxyfile.in" diff --git a/ElementalREADME.md b/ElementalREADME.md index 570370b917..1e9dae8091 100644 --- a/ElementalREADME.md +++ b/ElementalREADME.md @@ -1,10 +1,3 @@ -
- -[![Build Status](https://api.travis-ci.org/elemental/Elemental.svg?branch=master)](https://travis-ci.org/elemental/Elemental) -[![Join the chat at https://gitter.im/elemental/chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/elemental/chat?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) - **Elemental** is a modern C++ library for distributed-memory dense and sparse-direct linear algebra, conic optimization, and lattice reduction. The library was initially released in @@ -15,7 +8,7 @@ was originally released during a project on [Parallel Sweeping Preconditioners]( ### Documentation -The (now outdated) [documentation for Elemental](http://libelemental.org/documentation) is built using [Sphinx](http://sphinx.pocoo.org) and the [Read the Docs Theme](http://docs.readthedocs.org/en/latest/theme.html) +The (now outdated) documentation for Elemental is built using [Sphinx](http://sphinx.pocoo.org) and the [Read the Docs Theme](http://docs.readthedocs.org/en/latest/theme.html) ### Unique features diff --git a/cmake/configure_files/HydrogenConfig.cmake.in b/cmake/configure_files/HydrogenConfig.cmake.in index 8127f803ec..add71cc1b5 100644 --- a/cmake/configure_files/HydrogenConfig.cmake.in +++ b/cmake/configure_files/HydrogenConfig.cmake.in @@ -92,6 +92,7 @@ if (_HYDROGEN_HAVE_CUDA) endif () set(_HYDROGEN_HAVE_ROCM @HYDROGEN_HAVE_ROCM@) +set(_HYDROGEN_HAVE_ROCTRACER @HYDROGEN_HAVE_ROCTRACER@) if (_HYDROGEN_HAVE_ROCM) find_dependency(hip CONFIG) @@ -106,6 +107,14 @@ if (_HYDROGEN_HAVE_ROCM) find_dependency(rocblas CONFIG) find_dependency(rocsolver CONFIG) find_dependency(rocthrust CONFIG) + + if (_HYDROGEN_HAVE_ROCTRACER) + find_dependency(Roctracer MODULE COMPONENTS roctx) + endif () + + include(HydrogenCleanupHIPTargets) + h_clean_hip_targets() + set(HYDROGEN_HAVE_ROCM TRUE) endif (_HYDROGEN_HAVE_ROCM) @@ -114,7 +123,7 @@ if (HYDROGEN_HAVE_HALF) find_dependency(HALF) endif () -if (_HYDROGEN_HAVE_CUDA) +if (_HYDROGEN_HAVE_CUDA OR _HYDROGEN_HAVE_ROCM) set(HYDROGEN_GPU_USE_FP16 @HYDROGEN_GPU_USE_FP16@) endif () diff --git a/cmake/configure_files/hydrogen_config.h.in b/cmake/configure_files/hydrogen_config.h.in index a3f7434d2d..02c7993b92 100644 --- a/cmake/configure_files/hydrogen_config.h.in +++ b/cmake/configure_files/hydrogen_config.h.in @@ -40,6 +40,7 @@ // ROCm stuff #cmakedefine HYDROGEN_HAVE_ROCM +#cmakedefine HYDROGEN_HAVE_ROCTRACER // General GPU stuff #cmakedefine HYDROGEN_HAVE_CUB @@ -51,6 +52,7 @@ #cmakedefine HYDROGEN_HAVE_NCCL2 #cmakedefine HYDROGEN_HAVE_AL_MPI_CUDA #cmakedefine HYDROGEN_HAVE_AL_HOST_XFER +#cmakedefine HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV #cmakedefine HYDROGEN_ENSURE_HOST_MPI_BUFFERS #cmakedefine HYDROGEN_HAVE_CUDA_AWARE_MPI diff --git a/cmake/modules/FindRoctracer.cmake b/cmake/modules/FindRoctracer.cmake new file mode 100644 index 0000000000..d209bb1fee --- /dev/null +++ b/cmake/modules/FindRoctracer.cmake @@ -0,0 +1,78 @@ +# Sets the following variables +# +# Roctracer_FOUND +# Roctracer_LIBRARIES +# +# Defines the following imported target: +# +# roctracer::roctracer +# roctracer::roctracer_api +# roctracer::roctx_api +# + +set(_supported_components roctracer roctx) +if (NOT Roctracer_FIND_COMPONENTS) + set(Roctracer_FIND_COMPONENTS ${_supported_components}) +endif () + +foreach (comp IN LISTS Roctracer_FIND_COMPONENTS) + if (NOT ${comp} IN_LIST _supported_components) + message(FATAL_ERROR + "Cannot specify component \"${comp}\" for package Roctracer. 
" + "Supported components are: ${_supported_components}.") + endif () + + set(_header_name "${comp}.h") + set(_lib_name "${comp}64") + + find_path(${comp}_INCLUDE_PATH ${_header_name} + HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer + PATH_SUFFIXES include + DOC "The ${comp} include directory for roctracer." + NO_DEFAULT_PATH) + find_path(${comp}_INCLUDE_PATH ${_header_name} + HINTS ${ROCM_PATH}/include/roctracer $ENV{ROCM_PATH}/include/roctracer + DOC "The ${comp} include directory for roctracer." + NO_DEFAULT_PATH) + find_path(${comp}_INCLUDE_PATH ${_header_name}) + + find_library(${comp}_LIBRARY ${_lib_name} + HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer + HINTS ${ROCM_PATH} $ENV{ROCM_PATH} + PATH_SUFFIXES lib64 lib + DOC "The ${comp} library for roctracer." + NO_DEFAULT_PATH) + find_library(${comp}_LIBRARY ${_lib_name}) + + if (${comp}_LIBRARY AND ${comp}_INCLUDE_PATH) + set(Roctracer_${comp}_FOUND TRUE) + + if (NOT TARGET roctracer::${comp}_api) + add_library(roctracer::${comp}_api INTERFACE IMPORTED) + endif () + target_link_libraries(roctracer::${comp}_api INTERFACE + "${${comp}_LIBRARY}") + target_include_directories(roctracer::${comp}_api INTERFACE + "${${comp}_INCLUDE_PATH}") + + mark_as_advanced(${comp}_LIBRARY) + mark_as_advanced(${comp}_INCLUDE_PATH) + + list(APPEND _imported_libraries roctracer::${comp}_api) + else () + set(Roctracer_${comp}_FOUND FALSE) + endif () +endforeach () + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Roctracer HANDLE_COMPONENTS) + +if (Roctracer_FOUND) + if (NOT TARGET roctracer::roctracer) + add_library(roctracer::roctracer INTERFACE IMPORTED) + endif () + foreach (lib IN LISTS _imported_libraries) + target_link_libraries(roctracer::roctracer INTERFACE ${lib}) + endforeach () + set(Roctracer_LIBRARIES roctracer::roctracer) +endif (Roctracer_FOUND) diff --git a/cmake/modules/HydrogenCleanupHIPTargets.cmake b/cmake/modules/HydrogenCleanupHIPTargets.cmake new file mode 100644 index 0000000000..90d12b2590 --- /dev/null +++ b/cmake/modules/HydrogenCleanupHIPTargets.cmake @@ -0,0 +1,54 @@ +# This just finds some stuff correctly and cleans up the HIP targets. +macro(h_clean_hip_targets) + set(HIP_CLANG_ROOT "$ENV{ROCM_PATH}/llvm") + + file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS + "${HIP_CLANG_ROOT}/lib/clang/*/include") + find_path(HIP_CLANG_INCLUDE_PATH stddef.h + HINTS "${HIP_CLANG_INCLUDE_SEARCH_PATHS}" + NO_DEFAULT_PATH) + + if (HIP_CLANG_INCLUDE_PATH) + message(STATUS "Found clang include path: ${HIP_CLANG_INCLUDE_PATH}") + else () + message(WARNING + "Could not find clang include path. " + "Using whatever is in the hip IMPORTED targets") + endif () + + file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS + "${HIP_CLANG_ROOT}/lib/clang/*/lib/*") + find_library(ACTUAL_CLANGRT_BUILTINS clangrt-builtins + NAMES + clang_rt.builtins + clang_rt.builtins-x86_64 + PATHS + "${HIP_CLANGRT_LIB_SEARCH_PATHS}") + + if (ACTUAL_CLANGRT_BUILTINS) + message(STATUS "Found clangrt builtins: ${ACTUAL_CLANGRT_BUILTINS}") + else () + message(WARNING + "Could not find clangrt builtins. 
" + "Using whatever is in the hip IMPORTED targets") + endif () + + get_target_property(_HIP_HOST_LIBS hip::host INTERFACE_LINK_LIBRARIES) + get_target_property(_HIP_DEVICE_LIBS hip::device INTERFACE_LINK_LIBRARIES) + + string(REPLACE + "CLANGRT_BUILTINS-NOTFOUND" + "${ACTUAL_CLANGRT_BUILTINS}" + _NEW_HIP_HOST_LIBS + "${_HIP_HOST_LIBS}") + string(REPLACE + "CLANGRT_BUILTINS-NOTFOUND" + "${ACTUAL_CLANGRT_BUILTINS}" + _NEW_HIP_DEVICE_LIBS + "${_HIP_DEVICE_LIBS}") + + set_property(TARGET hip::host + PROPERTY INTERFACE_LINK_LIBRARIES ${_NEW_HIP_HOST_LIBS}) + set_property(TARGET hip::device + PROPERTY INTERFACE_LINK_LIBRARIES ${_NEW_HIP_DEVICE_LIBS}) +endmacro() diff --git a/include/El/blas_like/level1/Copy/Translate.hpp b/include/El/blas_like/level1/Copy/Translate.hpp index 55579afb13..db3f68718e 100644 --- a/include/El/blas_like/level1/Copy/Translate.hpp +++ b/include/El/blas_like/level1/Copy/Translate.hpp @@ -67,9 +67,15 @@ void Translate( const Int maxWidth = MaxLength(width, rowStride); const Int pkgSize = mpi::Pad(maxHeight*maxWidth); - simple_buffer buffer; - if(crossRank == root || crossRank == B.Root()) - buffer.allocate(pkgSize); + // When crossRank == root, this will be a SEND buffer + // (+SendRecv when !aligned), and it should use + // syncInfoA. When crossRank == B.Root(), this will be a RECV + // buffer, and it should use syncInfoB. Otherwise, this isn't + // used. + simple_buffer buffer = + (crossRank == root || crossRank == B.Root() + ? simple_buffer(pkgSize, syncInfoA) + : simple_buffer{}); const Int colAlignB = B.ColAlign(); const Int rowAlignB = B.RowAlign(); diff --git a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp index 4783359c9c..77cf807c28 100644 --- a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp +++ b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp @@ -10,6 +10,8 @@ #define EL_BLAS_COPY_TRANSLATEBETWEENGRIDS_HPP #include "core/environment/decl.hpp" +#include + namespace El { namespace copy @@ -3562,7 +3564,6 @@ void TranslateBetweenGridsAsync const Int mLocA = A.LocalHeight(); const Int nLocA = A.LocalWidth(); - mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); mpi::Group owningGroupA = A.Grid().OwningGroup(); @@ -3856,7 +3857,8 @@ void TranslateBetweenGrids( EL_DEBUG_CSE; /* Overview - + We broadcast the size of A to all the ranks in B to make sure that + all ranks in B subgrid has the correct size of A. Since we are using blocking communication, some care is required to avoid deadlocks. 
Let's start with a naive algorithm for [STAR,VC] matrices and optimize it in steps: @@ -3883,21 +3885,55 @@ void TranslateBetweenGrids( */ // Matrix dimensions - const Int m = A.Height(); - const Int n = A.Width(); + Int m = A.Height(); + Int n = A.Width(); + Int strideA = A.RowStride(); + Int ALDim = A.LDim(); + + mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); + + bool const inAGrid = A.Participating(); + bool const inBGrid = B.Participating(); + + Int recvMetaData[4]; + Int metaData[4]; + if(inAGrid) + { + metaData[0] = m; + metaData[1] = n; + metaData[2] = strideA; + metaData[3] = ALDim; + } + else + { + metaData[0] = 0; + metaData[1] = 0; + metaData[2] = 0; + metaData[3] = 0; + } + const std::vector sendMetaData (metaData, metaData + 4); + mpi::AllReduce(sendMetaData.data(), + recvMetaData, + 4, + mpi::MAX, + viewingCommB, + SyncInfo{}); + + m = recvMetaData[0]; + n = recvMetaData[1]; + strideA = recvMetaData[2]; + ALDim =recvMetaData[3]; + B.Resize(m, n); const Int nLocA = A.LocalWidth(); const Int nLocB = B.LocalWidth(); // Return immediately if there is no local data - const bool inAGrid = A.Participating(); - const bool inBGrid = B.Participating(); if (!inAGrid && !inBGrid) { return; } // Compute the number of messages to send/recv - const Int strideA = A.RowStride(); const Int strideB = B.RowStride(); const Int strideGCD = GCD(strideA, strideB); const Int numSends = Min(strideB/strideGCD, nLocA); @@ -3906,14 +3942,24 @@ void TranslateBetweenGrids( // Synchronize compute streams SyncInfo syncInfoA = SyncInfoFromMatrix(A.LockedMatrix()); SyncInfo syncInfoB = SyncInfoFromMatrix(B.Matrix()); - auto syncHelper = MakeMultiSync(syncInfoB, syncInfoA); - const SyncInfo& syncInfo = syncHelper; + + std::optional> maybeMultiSync; + if (inAGrid && inBGrid) + maybeMultiSync.emplace(syncInfoB, syncInfoA); + + SyncInfo const syncInfo = + (maybeMultiSync.has_value() + ? *maybeMultiSync + : (inAGrid ? syncInfoA : syncInfoB)); + + // Collective! + mpi::EnsureComm(viewingCommB, syncInfo); + mpi::EnsureComm(viewingCommB, syncInfo); // Translate the ranks from A's VC communicator to B's viewing so // that we can match send/recv communicators. Since A's VC // communicator is not necessarily defined on every process, we // instead work with A's owning group. - mpi::Comm const& viewingCommB = B.Grid().ViewingComm(); mpi::Group owningGroupA = A.Grid().OwningGroup(); const int sizeA = A.Grid().Size(); vector viewingRanksA(sizeA), owningRanksA(sizeA); @@ -3976,7 +4022,7 @@ void TranslateBetweenGrids( // Copy data locally copy::util::InterleaveMatrix( m, messageWidth, - A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(), + A.LockedBuffer(0,jLocA), 1, numSends*ALDim, B.Buffer(0,jLocB), 1, numRecvs*B.LDim(), syncInfo); } @@ -3984,7 +4030,7 @@ void TranslateBetweenGrids( // Send data to other rank copy::util::InterleaveMatrix( m, messageWidth, - A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(), + A.LockedBuffer(0,jLocA), 1, numSends*ALDim, messageBuf.data(), 1, m, syncInfo); mpi::Send( diff --git a/include/El/core.hpp b/include/El/core.hpp index 173e0e6247..4009f55084 100644 --- a/include/El/core.hpp +++ b/include/El/core.hpp @@ -80,14 +80,10 @@ using hydrogen::gpu_half_type; #endif // HYDROGEN_GPU_USE_FP16 } -#if __cplusplus >= 201402L -#define H_DEPRECATED(msg) [[deprecated(msg)]] -#elif defined(__GNUC__) -// This ^ isn't perfect -- many non-GCC compilers define __GNUC__. -#define H_DEPRECATED(msg) __attribute__ ((deprecated(msg))) -#else +// NOTE: These have not been as inspirational as I had hoped. 
I'm +// leaving the notes but preprocessing them away so the compile +// warnings stop. #define H_DEPRECATED(msg) -#endif #define EL_UNUSED(expr) (void)(expr) diff --git a/include/El/core/Element/decl.hpp b/include/El/core/Element/decl.hpp index 72d30b19c2..7cf87f7415 100644 --- a/include/El/core/Element/decl.hpp +++ b/include/El/core/Element/decl.hpp @@ -9,6 +9,7 @@ #ifndef EL_ELEMENT_DECL_HPP #define EL_ELEMENT_DECL_HPP +#include #include #include @@ -262,7 +263,7 @@ template(alpha); } }; template @@ -272,6 +273,37 @@ struct Caster,void> { return Complex( T(RealPart(alpha)), T(ImagPart(alpha)) ); } }; +#if defined HYDROGEN_HAVE_ROCM && defined HYDROGEN_GPU_USE_FP16 +#if defined HYDROGEN_HAVE_HALF +template <> +struct Caster +{ + static cpu_half_type Cast(gpu_half_type const& in) + { + return cpu_half_type(static_cast(in)); + } +};// Caster +#endif // defined HYDROGEN_HAVE_HALF + +template <> +struct Caster +{ + static double Cast(gpu_half_type const& in) + { + return static_cast(in); + } +};// Caster + +template <> +struct Caster> +{ + static Complex Cast(gpu_half_type const& alpha) + { + return Complex(static_cast(alpha), gpu_half_type(0)); + } +}; +#endif // defined HYDROGEN_HAVE_ROCM && defined HYDROGEN_GPU_USE_FP16 + // Set the real/imaginary part of an element // ----------------------------------------- template #include +#include #include #include #include @@ -21,6 +22,7 @@ namespace El { namespace details { + template void ThrowRuntimeError(Args&&... args) { @@ -29,6 +31,25 @@ void ThrowRuntimeError(Args&&... args) (void) dummy; throw std::runtime_error(oss.str()); } + +/** @brief Returns true iff env(H_MEMPOOL_DEBUG) is truthy. + * + * Truthy values are non-empty strings that start with any character + * other than '0' (ASCII "zero"). So "true", "false", "1", "13", + * "-q", ":)", and " " are all truthy, while "", "0true", "0false", + * "0000", "0123", and "0:)" are all falsey. + */ +bool debug_mempool() noexcept; + +/** @brief Check env(H_MEMPOOL_BIN_GROWTH). Default 1.6f. */ +float default_mempool_bin_growth() noexcept; + +/** @brief Check env(H_MEMPOOL_MIN_BIN). Default 1UL. */ +size_t default_mempool_min_bin() noexcept; + +/** @brief Check env(H_MEMPOOL_MAX_BIN). Default (1<<26). */ +size_t default_mempool_max_bin() noexcept; + } // namespace details /** Simple caching memory pool. @@ -50,10 +71,13 @@ class MemoryPool * @param bin_growth Controls how fast bins grow. * @param min_bin_size Smallest bin size (in bytes). * @param max_bin_size Largest bin size (in bytes). + * @param debug Print debugging messages. */ - MemoryPool(float bin_growth = 1.6, - size_t min_bin_size = 1, - size_t max_bin_size = 1<<26) + MemoryPool(float const bin_growth = details::default_mempool_bin_growth(), + size_t const min_bin_size = details::default_mempool_min_bin(), + size_t const max_bin_size = details::default_mempool_max_bin(), + bool const debug = details::debug_mempool()) + : debug_{debug} { std::set bin_sizes; for (float bin_size = min_bin_size; @@ -74,23 +98,47 @@ class MemoryPool // Set up bins. for (size_t i = 0; i < bin_sizes_.size(); ++i) free_data_.emplace_back(); + if (debug_) + { + std::clog << "==Mempool(" << this << ")== " + << "Created memory pool (" + << "pinned=" << (Pinned ? 
"t" : "f") + << ", growth=" << bin_growth + << ", min bin=" << bin_sizes_.front() + << ", max bin=" << bin_sizes_.back() << ")\n" + << "==Mempool(" << this << ")== " + << "Bin sizes: ["; + for (auto const& b : bin_sizes_) + std::clog << " " << b; + std::clog << " ]" << std::endl; + } } ~MemoryPool() { FreeAllUnused(); + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << alloc_to_bin_.size() + << " dangling allocations\n" + << "==Mempool(" << this << ")== " + << "Destroyed memory pool" + << std::endl; } /** Return memory of size bytes. */ void* Allocate(size_t size) { - size_t bin = get_bin(size); + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Requesting allocation of " + << size << " bytes." + << std::endl; + size_t const bin = get_bin(size); void* mem = nullptr; std::lock_guard lock(mutex_); // size is too large, this will not be cached. if (bin == INVALID_BIN) - { mem = do_allocation(size); - } else { // Check if there is available memory in our bin. @@ -98,6 +146,10 @@ class MemoryPool { mem = free_data_[bin].back(); free_data_[bin].pop_back(); + --num_cached_blks_; + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Reusing cached pointer " << mem << "\n"; } else { @@ -105,41 +157,45 @@ class MemoryPool } } alloc_to_bin_[mem] = bin; + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << alloc_to_bin_.size() + << " blocks allocated; " + << num_cached_blks_ + << " blocks cached" + << std::endl; + return mem; } /** Release previously allocated memory. */ void Free(void* ptr) { std::lock_guard lock(mutex_); - auto iter = alloc_to_bin_.find(ptr); + auto const iter = alloc_to_bin_.find(ptr); if (iter == alloc_to_bin_.end()) - { details::ThrowRuntimeError("Tried to free unknown ptr"); - } + + size_t const& bin = iter->second; + alloc_to_bin_.erase(iter); + if (bin == INVALID_BIN) + do_free(ptr); else { - size_t bin = iter->second; - alloc_to_bin_.erase(iter); - if (bin == INVALID_BIN) - { - do_free(ptr); - } - else - { - // Cache the pointer for reuse. - free_data_[bin].push_back(ptr); - } + // Cache the pointer for reuse. + free_data_[bin].push_back(ptr); + ++num_cached_blks_; + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Cached pointer " << ptr << "\n"; } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << alloc_to_bin_.size() + << " blocks allocated; " + << num_cached_blks_ + << " blocks cached" + << std::endl; } - /** Release all unused memory. */ - void FreeAllUnused() - { - std::lock_guard lock(mutex_); - for (size_t bin = 0; bin < bin_sizes_.size(); ++bin) - for (auto&& ptr : free_data_[bin]) - do_free(ptr); - } - private: /** Index of an invalid bin. */ @@ -158,6 +214,25 @@ class MemoryPool /** Map used pointers to the associated bin index. */ std::unordered_map alloc_to_bin_; + /** Track the total number of available blocks. */ + size_t num_cached_blks_; + + /** Print debugging messages throughout lifetime. */ + bool debug_; + + /** Release all unused memory. */ + void FreeAllUnused() + { + std::lock_guard lock(mutex_); + for (size_t bin = 0; bin < bin_sizes_.size(); ++bin) + { + for (auto&& ptr : free_data_[bin]) + do_free(ptr); + std::vector{}.swap(free_data_[bin]); + } + num_cached_blks_ = 0ul; + } + /** Allocate size bytes. */ inline void* do_allocation(size_t size); /** Free ptr. 
*/ @@ -179,7 +254,7 @@ class MemoryPool #ifdef HYDROGEN_HAVE_CUDA template <> -inline void* MemoryPool::do_allocation(size_t bytes) +inline void* MemoryPool::do_allocation(size_t const bytes) { void* ptr; auto error = cudaMallocHost(&ptr, bytes); @@ -189,11 +264,15 @@ inline void* MemoryPool::do_allocation(size_t bytes) "Failed to allocate CUDA pinned memory with message: ", "\"", cudaGetErrorString(error), "\""); } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Allocated pinned " << bytes << " bytes at " << ptr + << std::endl; return ptr; } template<> -inline void MemoryPool::do_free(void* ptr) +inline void MemoryPool::do_free(void* const ptr) { auto error = cudaFreeHost(ptr); if (error != cudaSuccess) @@ -202,49 +281,64 @@ inline void MemoryPool::do_free(void* ptr) "Failed to free CUDA pinned memory with message: ", "\"", cudaGetErrorString(error), "\""); } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Freed pinned ptr " << ptr + << std::endl; } #elif defined(HYDROGEN_HAVE_ROCM) template <> -inline void* MemoryPool::do_allocation(size_t bytes) +inline void* MemoryPool::do_allocation(size_t const bytes) { void* ptr; auto error = hipHostMalloc(&ptr, bytes); if (error != hipSuccess) - { details::ThrowRuntimeError( "Failed to allocate HIP pinned memory with message: ", "\"", hipGetErrorString(error), "\""); - } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Allocated pinned " << bytes << " bytes at " << ptr + << std::endl; return ptr; } template<> -inline void MemoryPool::do_free(void* ptr) +inline void MemoryPool::do_free(void* const ptr) { auto error = hipHostFree(ptr); if (error != hipSuccess) - { details::ThrowRuntimeError( "Failed to free HIP pinned memory with message: ", "\"", hipGetErrorString(error), "\""); - } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Freed pinned ptr " << ptr + << std::endl; } #endif // HYDROGEN_HAVE_CUDA template <> -inline void* MemoryPool::do_allocation(size_t bytes) { +inline void* MemoryPool::do_allocation(size_t const bytes) +{ void* ptr = std::malloc(bytes); if (ptr == nullptr) - { details::ThrowRuntimeError("Failed to allocate memory"); - } + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Allocated " << bytes << " bytes at " << ptr + << std::endl; return ptr; } template<> -inline void MemoryPool::do_free(void* ptr) +inline void MemoryPool::do_free(void* const ptr) { - return std::free(ptr); + std::free(ptr); + if (debug_) + std::clog << "==Mempool(" << this << ")== " + << "Freed ptr " << ptr + << std::endl; } #ifdef HYDROGEN_HAVE_GPU diff --git a/include/El/core/Profiling.hpp b/include/El/core/Profiling.hpp index a09199ff17..45d2a7181b 100644 --- a/include/El/core/Profiling.hpp +++ b/include/El/core/Profiling.hpp @@ -19,6 +19,10 @@ void DisableVTune() noexcept; void EnableNVProf() noexcept; void DisableNVProf() noexcept; +// These are no-ops if roctracer is not enabled at compile time +void EnableROCTX() noexcept; +void DisableROCTX() noexcept; + /** \brief A selection of colors to use with the profiling interface. 
* * It seems unlikely that a user will ever need to access these by diff --git a/include/El/core/environment/decl.hpp b/include/El/core/environment/decl.hpp index 8426ee1f9d..b9367fde42 100644 --- a/include/El/core/environment/decl.hpp +++ b/include/El/core/environment/decl.hpp @@ -35,7 +35,10 @@ using std::ostream; using std::ostringstream; using std::exception; -using std::uncaught_exception; +inline bool uncaught_exception() noexcept +{ + return std::uncaught_exceptions() > 0; +} void PrintVersion( ostream& os=cout ); void PrintConfig( ostream& os=cout ); diff --git a/include/El/core/imports/aluminum.hpp b/include/El/core/imports/aluminum.hpp index 6cc8391727..67da59d507 100644 --- a/include/El/core/imports/aluminum.hpp +++ b/include/El/core/imports/aluminum.hpp @@ -31,7 +31,11 @@ enum class Collective REDUCE, REDUCESCATTER, SCATTER, - SENDRECV + + // Not collectives, but what can you do + SENDRECV, + SEND, + RECV, };// enum class Collective #ifndef HYDROGEN_HAVE_ALUMINUM @@ -115,6 +119,8 @@ ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::MPIBackend); ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::MPIBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::MPIBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::MPIBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::MPIBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::MPIBackend); #ifdef HYDROGEN_HAVE_NCCL2 // NCCL backend supports these @@ -126,7 +132,9 @@ ADD_ALUMINUM_COLLECTIVE( Collective::GATHER, Al::NCCLBackend); ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::NCCLBackend); ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::NCCLBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::NCCLBackend); -//ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::NCCLBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::NCCLBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::NCCLBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::NCCLBackend); #endif // HYDROGEN_HAVE_NCCL2 #ifdef HYDROGEN_HAVE_AL_HOST_XFER @@ -140,6 +148,8 @@ ADD_ALUMINUM_COLLECTIVE( Collective::REDUCE, Al::HostTransferBackend); ADD_ALUMINUM_COLLECTIVE(Collective::REDUCESCATTER, Al::HostTransferBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SCATTER, Al::HostTransferBackend); ADD_ALUMINUM_COLLECTIVE( Collective::SENDRECV, Al::HostTransferBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::SEND, Al::HostTransferBackend); +ADD_ALUMINUM_COLLECTIVE( Collective::RECV, Al::HostTransferBackend); #endif // HYDROGEN_HAVE_AL_HOST_XFER template @@ -349,10 +359,26 @@ struct SyncInfoManager }; #endif // HYDROGEN_HAVE_GPU +inline bool use_separate_comm_stream() noexcept +{ + char const* const env = std::getenv("H_USE_SEPARATE_COMM_STREAM"); + return (env && std::strlen(env) && env[0] != '0'); +} + template SyncInfo()> const& BackendSyncInfo() { constexpr Device D = DeviceForBackend(); +#ifdef HYDROGEN_HAVE_GPU + if constexpr (D == El::Device::GPU) + { + static bool const use_separate_stream = use_separate_comm_stream(); + if (!use_separate_stream) + { + return El::gpu::DefaultSyncInfo(); + } + } +#endif // HYDROGEN_HAVE_GPU static SyncInfoManager si_mgr_(BackendT::Name()); return si_mgr_.si_; } diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index d51c4d1384..a2527fd27c 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -173,6 +173,15 @@ extern template struct Types; // Avoid conflict with Int #undef PROTO #endif // !defined H_INSTANTIATING_MPI_TYPES_STRUCT +#ifdef 
HYDROGEN_HAVE_HALF +extern template struct Types; +extern template struct Types>; +#endif +#ifdef HYDROGEN_GPU_USE_FP16 +extern template struct Types; +extern template struct Types>; +#endif + template struct MPIBaseHelper { typedef T value; }; template diff --git a/include/hydrogen/PoolAllocator.hpp b/include/hydrogen/PoolAllocator.hpp new file mode 100644 index 0000000000..b251a7171f --- /dev/null +++ b/include/hydrogen/PoolAllocator.hpp @@ -0,0 +1,1145 @@ +// See LICENSE for Hydrogen license. Original license for CUB follows: +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Extended asynchronous pooling allocator with exponential, multiplicative, and + * user-specified bin sizes. This allocator is based on CUB's pooling allocator + * and can use {cuda,hip}MallocAsync as necessary. It also provides extensive + * reporting for allocations, bins, and extraneous memory. 
+ ******************************************************************************/ + +#ifndef HYDROGEN_POOLALLOCATOR_HPP_ +#define HYDROGEN_POOLALLOCATOR_HPP_ + +#include +#include +#include +#include +#include + +#include + +// Set up functions +#ifdef HYDROGEN_HAVE_CUDA +#include + +#define gpuMallocAsync cudaMallocAsync +#define gpuMalloc cudaMalloc +#define gpuFreeAsync cudaFreeAsync +#define gpuFree cudaFree +#define gpuSetDevice cudaSetDevice +#define gpuGetDevice cudaGetDevice +#define gpuEventQuery cudaEventQuery +#define gpuGetLastError cudaGetLastError +#define gpuEventDestroy cudaEventDestroy +#define gpuEventCreateWithFlags cudaEventCreateWithFlags +#define gpuEventDisableTiming cudaEventDisableTiming +#define gpuEventRecord cudaEventRecord +#define gpuGetErrorString cudaGetErrorString + +#define gpuStream_t cudaStream_t +#define gpuEvent_t cudaEvent_t +#define gpuError_t cudaError_t + +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuErrorMemoryAllocation cudaErrorMemoryAllocation + +#ifndef GPU_PTX_ARCH +#ifndef __CUDA_ARCH__ +#define GPU_PTX_ARCH 0 +#else +#define GPU_PTX_ARCH __CUDA_ARCH__ +#endif +#endif + +#elif defined(HYDROGEN_HAVE_ROCM) +#include + +#define gpuMallocAsync hipMallocAsync +#define gpuMalloc hipMalloc +#define gpuFreeAsync hipFreeAsync +#define gpuFree hipFree +#define gpuSetDevice hipSetDevice +#define gpuGetDevice hipGetDevice +#define gpuEventQuery hipEventQuery +#define gpuGetLastError hipGetLastError +#define gpuEventDestroy hipEventDestroy +#define gpuEventCreateWithFlags hipEventCreateWithFlags +#define gpuEventDisableTiming hipEventDisableTiming +#define gpuEventRecord hipEventRecord +#define gpuGetErrorString hipGetErrorString + +#define gpuStream_t hipStream_t +#define gpuEvent_t hipEvent_t +#define gpuError_t hipError_t + +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuErrorMemoryAllocation hipErrorMemoryAllocation + +#ifndef GPU_PTX_ARCH +#ifndef __HIP_ARCH__ +#define GPU_PTX_ARCH 0 +#else +#define GPU_PTX_ARCH 1 +#endif +#endif +#else +#error "This file must be included with a GPU (CUDA/ROCm) environment" +#endif + +namespace gpudebug { +/* Minimal copy of CubDebug */ +__host__ __device__ __forceinline__ gpuError_t Debug(gpuError_t error, + const char *filename, + int line) { + if (error) { +#if (GPU_PTX_ARCH == 0) + fprintf(stderr, "GPU error %d [%s, %d]: %s\n", error, filename, line, + gpuGetErrorString(error)); + fflush(stderr); +#else + printf("GPU error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", + error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, + threadIdx.x, filename, line); +#endif + } + return error; +} +} // namespace gpudebug + +/** + * \brief Debug macro + */ +#ifndef gpuDebug +#define gpuDebug(e) gpudebug::Debug((gpuError_t)(e), __FILE__, __LINE__) +#endif + +/** + * Prints human-readable size (for reporting) + */ +static inline void HumanReadableSize(size_t bytes, std::ostream &os) { + const std::string sizes[] = {"B", "KiB", "MiB", "GiB", "TiB"}; + int unit = 0; + float size = bytes; + while (size > 1024) { + size /= 1024; + ++unit; + } + auto oldprec = os.precision(unit > 0 ? 
2 : 0); + auto oldf = os.setf(std::ios_base::fixed, std::ios_base::floatfield); + os << size << " " << sizes[unit]; + os.precision(oldprec); + os.setf(oldf); +} + +namespace hydrogen { + +/****************************************************************************** + * PooledDeviceAllocator + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing + * cached device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. + * Once freed, the allocation becomes available immediately for reuse within the + * \p active_stream with which it was associated with during allocation, and it + * becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation + * request of a given size will only consider cached allocations within the + * corresponding bin. + * - (EXTENDED) Bin limits have a combined geometric/linear progression; or can + * be given as a set of sizes. It behaves as follows: + * - If a set of sizes is given in \p bin_sizes, they are used to construct + * the allocation bins. If an allocation is larger than the largest bin, the + * behavior matches the rest of the algorithm. Allocations in [0, bin_min] + * will allocate ``bin_min`` bytes. + * - Bin limits progress geometrically in accordance with the (integer) + * growth factor \p bin_growth provided during construction. Unused device + * allocations within a larger bin cache are not reused for allocation + * requests that categorize to smaller bin sizes. + * Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up + * to ( \p bin_growth ^ \p min_bin ). + * - If two consecutive geometric bins exceed \p bin_mult_threshold and + * \p bin_mult is given, a linear binning scheme is created where bins + * follow \p bin_growth ^ some_bin + \p bin_mult * n + * - Allocations above min( \p bin_growth ^ \p max_bin , \p + * max_bin_alloc_size ) are not rounded up to the nearest bin and are simply + * freed when they are deallocated instead of being returned to a bin-cache. + * - If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they + * are deallocated instead of being returned to their bin-cache. 
+ * + */ +struct PooledDeviceAllocator { + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int)-1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t)-1; + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor { + void *d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + size_t requested_bytes; // Size of true allocation in bytes + bool binned; // Whether the block is part of the pool bins + int device; // device ordinal + gpuStream_t associated_stream; // Associated associated_stream + gpuEvent_t ready_event; // Signal when associated stream has run to the + // point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its + // pointer and device) + BlockDescriptor(void *d_ptr, int device) + : d_ptr(d_ptr), bytes(0), requested_bytes(0), binned(false), + device(device), associated_stream(0), ready_event(0) {} + + // Constructor (suitable for searching maps for a range of suitable blocks, + // given a device) + BlockDescriptor(int device) + : d_ptr(NULL), bytes(0), requested_bytes(0), binned(false), + device(device), associated_stream(0), ready_event(0) {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, + const BlockDescriptor &b) { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each + /// device + typedef std::map GpuCachedBytes; + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow(unsigned int base, unsigned int exp) { + unsigned int retval = 1; + while (exp > 0) { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, + unsigned int base, size_t value) { + power = 0; + rounded_bytes = 1; + + if (value * base < value) { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while 
(rounded_bytes < value) { + rounded_bytes *= base; + power++; + } + } + + size_t NearestMultOf(unsigned int mult, size_t value) { + // Ceiling division followed by multiplication + return ((value + mult - 1) / mult) * mult; + } + + unsigned int ComputeLinearBinIndex(unsigned int bin_growth, + unsigned int bin_mult_threshold) { + if (bin_mult_threshold == INVALID_BIN || bin_growth == 0 || + bin_mult_threshold == 0) + return INVALID_BIN; + + return static_cast(std::log(bin_mult_threshold) / + std::log(bin_growth)); + } + + size_t ComputeMaxBinBytes(unsigned int bin_growth, unsigned int max_bin, + size_t max_bin_alloc_size) { + size_t result = INVALID_SIZE; + if (max_bin != INVALID_BIN) { + result = IntPow(bin_growth, max_bin); + } + if (max_bin_alloc_size != INVALID_SIZE) { + result = std::min(result, max_bin_alloc_size); + } + return result; + } + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + std::mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + // Extensions + unsigned int bin_mult_threshold; /// Threshold to switch between geometric and + /// linear growth + unsigned int bin_mult; /// Linear bin scaling size + size_t max_bin_alloc_size; /// Maximal binned allocation size + std::set bin_sizes; /// Explicit control over bin sizes + + unsigned int linear_bin_index; /// Geometric bin to consider linear binning + /// from (computed) + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool + skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when + /// destructor is called. (The runtime may have already + /// shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + bool malloc_async; /// Use {cuda,hip}MallocAsync + + std::set actual_bin_sizes; /// Bin sizes used by the allocator + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached + /// bytes on that device + CachedBlocks + cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. 
+ */ + PooledDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = + INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = + INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default + ///< is no limit) + bool skip_cleanup = + false, ///< Whether or not to skip a call to \p FreeAllCached() when + ///< the destructor is called (default is to deallocate) + bool debug = false, ///< Whether or not to print (de)allocation events to + ///< stdout (default is no stderr output) + unsigned int bin_mult_threshold = + INVALID_BIN, ///< Threshold to switch between geometric and linear + ///< growth + unsigned int bin_mult = INVALID_BIN, ///< Linear bin scaling size + size_t max_bin_alloc_size = + INVALID_SIZE, ///< Maximal binned allocation size + std::set bin_sizes = {}, ///< Explicit control over bin size + bool use_malloc_async = false) ///< Use asynchronous malloc/free calls + : bin_growth(bin_growth), min_bin(min_bin), max_bin(max_bin), + bin_mult_threshold(bin_mult_threshold), bin_mult(bin_mult), + bin_sizes(bin_sizes), + linear_bin_index(ComputeLinearBinIndex(bin_growth, bin_mult_threshold)), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes( + ComputeMaxBinBytes(bin_growth, max_bin, max_bin_alloc_size)), + max_cached_bytes(max_cached_bytes), skip_cleanup(skip_cleanup), + debug(debug), malloc_async(use_malloc_async), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = + * 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + PooledDeviceAllocator(bool skip_cleanup = false, bool debug = false) + : bin_growth(8), min_bin(3), max_bin(7), bin_mult_threshold(INVALID_BIN), + bin_mult(INVALID_BIN), bin_sizes{}, linear_bin_index(INVALID_BIN), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), skip_cleanup(skip_cleanup), + debug(debug), malloc_async(false), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to + * cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use + * or cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + gpuError_t SetMaxCachedBytes(size_t max_cached_bytes_) { + // Lock + mutex.lock(); + + if (debug) + printf("Changing max_cached_bytes (%lld -> %lld)\n", + (long long)this->max_cached_bytes, (long long)max_cached_bytes_); + + this->max_cached_bytes = max_cached_bytes_; + + // Unlock + mutex.unlock(); + + return gpuSuccess; + } + + /** + * \brief Implements the bin-finding algorithm described in the class + * documentation. Returns true if a bin was found, or false otherwise. + */ + bool FindBin(BlockDescriptor &search_key) { + size_t bytes = search_key.requested_bytes; + search_key.bytes = bytes; + + if (bytes > max_bin_bytes) { + // Size is greater than our preconfigured maximum: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. 
+ return false; + } + + // If a custom bin histogram is given, use that + auto it = bin_sizes.lower_bound(bytes); + if (it != bin_sizes.end()) { + search_key.bytes = *it; + return true; + } + + // Find geometric bin + unsigned int geobin; + NearestPowerOf(geobin, search_key.bytes, bin_growth, bytes); + // Minimum bin + if (geobin < min_bin) { + // Bin is less than minimum bin: round up + search_key.bytes = min_bin_bytes; + return true; + } + + // Test for linear binning; if so, find linear bin + if (linear_bin_index != INVALID_BIN && geobin >= linear_bin_index) { + search_key.bytes = NearestMultOf(bin_mult, bytes); + return true; + } + + // Otherwise, use geometric bin + if (geobin > max_bin) { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + return false; + } + + // search_key.bytes was set above by NearestPowerOf + return true; + } + + /** + * \brief Provides a suitable allocation of device memory for the given size + * on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. + */ + gpuError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + gpuStream_t active_stream = + 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + gpuError_t error = gpuSuccess; + + if (device == INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + search_key.requested_bytes = bytes; + bool binned = FindBin(search_key); + search_key.binned = binned; + + if (binned) { + // Search for a suitable cached allocation: lock + mutex.lock(); + + // Add bin size to created bin sizes + actual_bin_sizes.insert(search_key.bytes); + + // Iterate through the range of cached blocks on the same device in the + // same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) && + (block_itr->device == device) && + (block_itr->bytes == search_key.bytes)) { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + bool is_reusable = false; + if (active_stream == block_itr->associated_stream) { + is_reusable = true; + } else { + const gpuError_t event_status = gpuEventQuery(block_itr->ready_event); + if (event_status != gpuErrorNotReady) { + static_cast(gpuDebug(event_status)); + is_reusable = true; + } + } + + if (is_reusable) { + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + search_key.requested_bytes = bytes; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) + printf("\tDevice %d reused cached block at %p (%lld bytes) for " + "stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.unlock(); + } + + // Allocate the block if necessary + if (!found) { + // Set runtime's current device to specified device (entrypoint may not be + // set) + if (device != entrypoint_device) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + if (gpuDebug(error = gpuSetDevice(device))) + return error; + } + + // Attempt to allocate + if (gpuDebug(error = MallocInternal(&search_key.d_ptr, search_key.bytes, + active_stream)) == + gpuErrorMemoryAllocation) { + // The allocation attempt failed: free all cached blocks on device and + // retry + if (debug) + printf("\tDevice %d failed to allocate %lld bytes for stream %lld, " + "retrying after freeing cached allocations", + device, (long long)search_key.bytes, + (long long)search_key.associated_stream); + + error = gpuSuccess; // Reset the error we will return + static_cast(gpuGetLastError()); // Reset error + + // Lock + mutex.lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && + (block_itr->device == device)) { + // No need to worry about synchronization with the device: gpuFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
+ if (gpuDebug(error = FreeInternal(block_itr->d_ptr, + block_itr->associated_stream))) + break; + if (gpuDebug(error = gpuEventDestroy(block_itr->ready_event))) + break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) + printf("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks " + "cached (%lld bytes), %lld live blocks (%lld bytes) " + "outstanding.\n", + device, (long long)block_itr->bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + + block_itr = cached_blocks.erase(block_itr); + } + + // Unlock + mutex.unlock(); + + // Return under error + if (error) + return error; + + // Try to allocate again + if (gpuDebug(error = MallocInternal(&search_key.d_ptr, search_key.bytes, + active_stream))) + return error; + } + + search_key.associated_stream = active_stream; + + // Create ready event + if (gpuDebug(error = gpuEventCreateWithFlags(&search_key.ready_event, + gpuEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.unlock(); + + if (debug) + printf("\tDevice %d allocated new device block at %p (%lld bytes " + "associated with stream %lld).\n", + device, search_key.d_ptr, (long long)search_key.bytes, + (long long)search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && + (entrypoint_device != device)) { + if (gpuDebug(error = gpuSetDevice(entrypoint_device))) + return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) + printf( + "\t\t%lld available blocks cached (%lld bytes), %lld live blocks " + "outstanding(%lld bytes).\n", + (long long)cached_blocks.size(), (long long)cached_bytes[device].free, + (long long)live_blocks.size(), (long long)cached_bytes[device].live); + + return error; + } + + /** + * \brief Provides a suitable allocation of device memory for the given size + * on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. + */ + gpuError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + gpuStream_t active_stream = + 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + /** + * \brief Frees a live allocation of device memory on the specified device, + * returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. 
+ */ + gpuError_t DeviceFree(int device, void *d_ptr) { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + gpuError_t error = gpuSuccess; + + if (device == INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the + // max cached threshold + if (search_key.binned && + (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) + printf("\tDevice %d returned %lld bytes from associated stream " + "%lld.\n\t\t %lld available blocks cached (%lld bytes), %lld " + "live blocks outstanding. (%lld bytes)\n", + device, (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + } + } + + // Unlock + mutex.unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + return error; + if (gpuDebug(error = gpuSetDevice(device))) + return error; + } + + if (recached) { + // Insert the ready event in the associated stream (must have current + // device set properly) + if (gpuDebug(error = gpuEventRecord(search_key.ready_event, + search_key.associated_stream))) + return error; + } + + if (!recached) { + // Free the allocation from the runtime and cleanup the event. + if (gpuDebug(error = FreeInternal(d_ptr, search_key.associated_stream))) + return error; + if (gpuDebug(error = gpuEventDestroy(search_key.ready_event))) + return error; + + if (debug) + printf("\tDevice %d freed %lld bytes from associated stream " + "%lld.\n\t\t %lld available blocks cached (%lld bytes), %lld " + "live blocks (%lld bytes) outstanding.\n", + device, (long long)search_key.bytes, + (long long)search_key.associated_stream, + (long long)cached_blocks.size(), + (long long)cached_bytes[device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && + (entrypoint_device != device)) { + if (gpuDebug(error = gpuSetDevice(entrypoint_device))) + return error; + } + + return error; + } + + /** + * \brief Frees a live allocation of device memory on the current device, + * returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within + * the \p active_stream with which it was associated with during allocation, + * and it becomes available for reuse within other streams when all prior work + * submitted to \p active_stream has completed. 
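For orientation, the two DeviceAllocate overloads and DeviceFree above form a deliberately CUB-like, stream-aware interface. The sketch below shows what that looks like from the caller's side; it is illustrative only and assumes a ROCm build, where the pool's gpu* aliases expand to the HIP runtime. The stream handling and byte counts are made up for the example.

#include <cstdio>
#include <hip/hip_runtime.h>
#include "hydrogen/PoolAllocator.hpp"

int main()
{
    hydrogen::PooledDeviceAllocator pool{/*bin_growth=*/2, /*min_bin=*/1, /*max_bin=*/20};

    hipStream_t stream;
    (void) hipStreamCreate(&stream);

    // First request: nothing is cached yet, so the pool asks the runtime.
    void* a = nullptr;
    pool.DeviceAllocate(&a, 1000, stream);   // rounded up to the 1024-byte bin
    pool.DeviceFree(a);                      // cached, not returned to the runtime

    // Same-sized request on the same stream: served from the cache immediately.
    void* b = nullptr;
    pool.DeviceAllocate(&b, 1000, stream);
    pool.DeviceFree(b);

    std::printf("cached: %zu bytes, live: %zu bytes\n",
                pool.FreeMemory(), pool.LiveMemory());

    (void) hipStreamSynchronize(stream);
    (void) hipStreamDestroy(stream);
    return 0;   // remaining cached blocks are released by the destructor
}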
+ */ + gpuError_t DeviceFree(void *d_ptr) { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + /** + * \brief Frees all cached device allocations on all devices + */ + gpuError_t FreeAllCached() { + gpuError_t error = gpuSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.lock(); + + while (!cached_blocks.empty()) { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuGetDevice(&entrypoint_device))) + break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) { + if (gpuDebug(error = gpuSetDevice(begin->device))) + break; + current_device = begin->device; + } + + // Free device memory + if (gpuDebug(error = + FreeInternal(begin->d_ptr, begin->associated_stream))) + break; + if (gpuDebug(error = gpuEventDestroy(begin->ready_event))) + break; + + // Reduce balance and erase entry + const size_t block_bytes = begin->bytes; + cached_bytes[current_device].free -= block_bytes; + cached_blocks.erase(begin); + + if (debug) + printf( + "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached " + "(%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long)block_bytes, + (long long)cached_blocks.size(), + (long long)cached_bytes[current_device].free, + (long long)live_blocks.size(), + (long long)cached_bytes[current_device].live); + } + + mutex.unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) { + if (gpuDebug(error = gpuSetDevice(entrypoint_device))) + return error; + } + + return error; + } + + /** + * \brief Destructor + */ + virtual ~PooledDeviceAllocator() { + if (!skip_cleanup) + static_cast(FreeAllCached()); + } + + /* Inspection and reporting methods */ + + size_t TotalAllocatedMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (auto const &[dev, totals] : cached_bytes) { + if (device != INVALID_DEVICE_ORDINAL && device != dev) + continue; + result += totals.live + totals.free; + } + return result; + } + + size_t FreeMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (auto const &[dev, totals] : cached_bytes) { + if (device != INVALID_DEVICE_ORDINAL && device != dev) + continue; + result += totals.free; + } + return result; + } + + size_t GetBinFreeMemory(int device = INVALID_DEVICE_ORDINAL, + size_t bin_size = INVALID_SIZE) const { + size_t result = 0; + for (BlockDescriptor const &desc : cached_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (desc.bytes == bin_size) + result += desc.bytes; + } + return result; + } + + size_t LiveMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (auto const &[dev, totals] : cached_bytes) { + if (device != INVALID_DEVICE_ORDINAL && device != dev) + continue; + result += totals.live; + } + return result; + } + + size_t GetBinLiveMemory(int device = INVALID_DEVICE_ORDINAL, + size_t bin_size = INVALID_SIZE) const { + size_t result = 0; + for (BlockDescriptor const &desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (desc.bytes == bin_size) + result += desc.bytes; + } + return result; + } + + size_t NonbinnedMemory(int device = INVALID_DEVICE_ORDINAL) const { + size_t result = 0; + for (BlockDescriptor const 
&desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (!desc.binned) + result += desc.bytes; + } + return result; + } + + size_t ExcessMemory(int device = INVALID_DEVICE_ORDINAL, + size_t bin_size = INVALID_SIZE) const { + size_t result = 0; + for (BlockDescriptor const &desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + if (bin_size != INVALID_SIZE && desc.bytes != bin_size) + continue; + result += desc.bytes - desc.requested_bytes; + } + return result; + } + + size_t GetNumBuffers(int device = INVALID_DEVICE_ORDINAL, bool cached = true, + bool live = true) const { + size_t result = 0; + if (cached) { + for (BlockDescriptor const &desc : cached_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + ++result; + } + } + if (live) { + for (BlockDescriptor const &desc : live_blocks) { + if (device != INVALID_DEVICE_ORDINAL && desc.device != device) + continue; + ++result; + } + } + return result; + } + + void Report(std::ostream &os, bool report_bins = true) const { + os << "Memory pool configuration:" << std::endl; + os << " Geometric bins - " << bin_growth << " ^ (" << min_bin << "-" + << ((max_bin == INVALID_BIN) ? "inf" : std::to_string(max_bin)) + << ")" << std::endl; + if (bin_mult_threshold == INVALID_BIN) { + os << " Linear bins - DISABLED" << std::endl; + } else { + os << " Linear bins - when geometric bin difference > " + << bin_mult_threshold << ", allocate in multiples of " << bin_mult + << std::endl; + } + + if (bin_sizes.size() == 0) { + os << " Custom bins - NONE" << std::endl; + } else { + os << " Custom bins - "; + bool first = true; + for (auto const &bin : bin_sizes) { + if (!first) + os << ", "; + HumanReadableSize(bin, os); + first = false; + } + os << std::endl; + } + os << " mallocAsync: " << (malloc_async ? "enabled" : "disabled") + << ", debug: " << (debug ? "enabled" : "disabled") + << ", skip cleanup: " << (skip_cleanup ? "yes" : "no") << std::endl; + + for (auto const &[dev, totals] : cached_bytes) { + os << "Memory pool allocation report (Device " << dev + << "):" << std::endl; + os << " Allocated memory: "; + HumanReadableSize(totals.live + totals.free, os); + os << " (Live: "; + HumanReadableSize(totals.live, os); + os << ", free: "; + HumanReadableSize(totals.free, os); + os << "). 
Buffers: " << GetNumBuffers() << std::endl; + os << " Total excess memory due to binning: "; + HumanReadableSize(ExcessMemory(dev), os); + os << std::endl; + + if (report_bins) { + os << " Detailed bin report:" << std::endl; + for (auto const &bin : actual_bin_sizes) { + os << " "; + HumanReadableSize(bin, os); + os << ": Live = "; + HumanReadableSize(GetBinLiveMemory(dev, bin), os); + os << ", Free = "; + HumanReadableSize(GetBinFreeMemory(dev, bin), os); + os << ", Excess = "; + HumanReadableSize(ExcessMemory(dev, bin), os); + os << std::endl; + } + os << " Non-binned: "; + HumanReadableSize(NonbinnedMemory(dev), os); + os << std::endl; + } + } + } + +private: + gpuError_t MallocInternal(void **ptr, size_t size, + gpuStream_t active_stream) { + if (malloc_async) { + return gpuMallocAsync(ptr, size, active_stream); + } else { + return gpuMalloc(ptr, size); + } + } + + gpuError_t FreeInternal(void *ptr, gpuStream_t active_stream) { + if (malloc_async) { + return gpuFreeAsync(ptr, active_stream); + } else { + return gpuFree(ptr); + } + } +}; + +} // namespace hydrogen + +#undef gpuMallocAsync +#undef gpuMalloc +#undef gpuFreeAsync +#undef gpuFree +#undef gpuSetDevice +#undef gpuGetDevice +#undef gpuEventQuery +#undef gpuGetLastError +#undef gpuEventDestroy +#undef gpuEventCreateWithFlags +#undef gpuEventDisableTiming +#undef gpuEventRecord +#undef gpuGetErrorString +#undef gpuStream_t +#undef gpuEvent_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuErrorMemoryAllocation +#undef GPU_ARCH + +#endif // HYDROGEN_POOLALLOCATOR_HPP_ diff --git a/include/hydrogen/SynchronizeAPI.hpp b/include/hydrogen/SynchronizeAPI.hpp index 339bcf355e..b9261018cd 100644 --- a/include/hydrogen/SynchronizeAPI.hpp +++ b/include/hydrogen/SynchronizeAPI.hpp @@ -9,33 +9,42 @@ namespace hydrogen // This synchronizes the additional SyncInfos to the "master". That // is, the execution streams described by the "others" will wait // for the "master" stream. -template -void AddSynchronizationPoint( - SyncInfo const& master, - SyncInfo const&... others) +template +void AddSynchronizationPoint(SyncInfo const &master, + SyncInfo const &other, + SyncInfo const &...others) { - AddSynchronizationPoint(master); +#ifdef HYDROGEN_HAVE_GPU + if constexpr (D == Device::GPU && D == D2) { + // When the streams are the same, there is no need to create + // synchronization points. Skip "other" call recursively with the rest. + if (master.Stream() == other.Stream()) + { + if constexpr (sizeof...(others) > 0UL) + AddSynchronizationPoint(master, others...); + return; + } + } +#endif // HYDROGEN_HAVE_GPU - int dummy[] = { (details::AddSyncPoint(master, others), 0)... }; - (void) dummy; + AddSynchronizationPoint(master); + int dummy[] = {(details::AddSyncPoint(master, other), 0), + (details::AddSyncPoint(master, others), 0)...}; + (void)dummy; } template -void AllWaitOnMaster( - SyncInfo const& master, SyncInfo const&... others) +void AllWaitOnMaster(SyncInfo const &master, SyncInfo const &...others) { AddSynchronizationPoint(master, others...); } template -void MasterWaitOnAll( - SyncInfo const& master, - SyncInfo const&... 
others) +void MasterWaitOnAll(SyncInfo const &master, SyncInfo const &...others) { - int dummy[] = { - (AddSynchronizationPoint(others, master), 0)...}; - (void) dummy; + int dummy[] = {(AddSynchronizationPoint(others, master), 0)...}; + (void)dummy; } -}// namespace hydrogen +} // namespace hydrogen #endif // HYDROGEN_SYNCHRONIZEAPI_HPP_ diff --git a/include/hydrogen/blas/GPU_BLAS_impl.hpp b/include/hydrogen/blas/GPU_BLAS_impl.hpp index e2f14acd28..3d48277654 100644 --- a/include/hydrogen/blas/GPU_BLAS_impl.hpp +++ b/include/hydrogen/blas/GPU_BLAS_impl.hpp @@ -48,6 +48,25 @@ namespace gpu_lapack_impl = hydrogen::cusolver; namespace gpu_blas_impl = hydrogen::rocblas; namespace gpu_lapack_impl = hydrogen::rocsolver; +#if defined(HYDROGEN_GPU_USE_FP16) +#if defined(HYDROGEN_HAVE_HALF) +template <> +struct hydrogen::Caster<__half, rocblas_half> +{ + static rocblas_half Cast(__half const& x) + { + return *(reinterpret_cast(&x)); + } +}; +#else +template <> +inline rocblas_half hydrogen::To(__half const& x) +{ + return *(reinterpret_cast(&x)); +} +#endif +#endif + #else #pragma GCC error "LOGIC ERROR: No GPU programming model enabled." #endif @@ -76,13 +95,14 @@ void AxpyImpl(SizeT size, T const& alpha, T* Y, SizeT incy, SyncInfo const& si) { - using NTP = MakePointer>; - using CNTP = MakePointerToConst>; + using NT = NativeType; + using NTP = MakePointer; + using CNTP = MakePointerToConst; SyncManager mgr(GetLibraryHandle(), si); gpu_blas_impl::Axpy( GetLibraryHandle(), - ToSizeT(size), alpha, + ToSizeT(size), hydrogen::To(alpha), reinterpret_cast(X), ToSizeT(incx), reinterpret_cast(Y), ToSizeT(incy)); } @@ -406,8 +426,9 @@ void GemmImpl( T* C, SizeT ldc, SyncInfo const& si) { - using NTP = MakePointer>; - using CNTP = MakePointerToConst>; + using NT = NativeType; + using NTP = MakePointer; + using CNTP = MakePointerToConst; SyncManager mgr(GetLibraryHandle(), si); gpu_blas_impl::Gemm( @@ -415,10 +436,10 @@ void GemmImpl( ToNativeTransposeMode(transA), ToNativeTransposeMode(transB), ToSizeT(m), ToSizeT(n), ToSizeT(k), - alpha, + To(alpha), reinterpret_cast(A), ToSizeT(lda), reinterpret_cast(B), ToSizeT(ldb), - beta, + To(beta), reinterpret_cast(C), ToSizeT(ldc)); } @@ -444,10 +465,10 @@ void GemmStridedBatchedImpl( ToNativeTransposeMode(transpA), ToNativeTransposeMode(transpB), ToSizeT(m), ToSizeT(n), ToSizeT(k), - &alpha, + reinterpret_cast(&alpha), reinterpret_cast(A), ToSizeT(lda), ToSizeT(strideA), reinterpret_cast(B), ToSizeT(ldb), ToSizeT(strideB), - &beta, + reinterpret_cast(&beta), reinterpret_cast(C), ToSizeT(ldc), ToSizeT(strideC), ToSizeT(batchCount)); } @@ -660,8 +681,9 @@ void GemvImpl( T* y, SizeT incy, SyncInfo const& si) { - using NTP = MakePointer>; - using CNTP = MakePointerToConst>; + using NT = NativeType; + using NTP = MakePointer; + using CNTP = MakePointerToConst; if (incy != SizeT(1)) throw std::runtime_error("incy must be 1 right now. 
" @@ -682,10 +704,10 @@ void GemvImpl( GetLibraryHandle(), ToNativeTransposeMode(ATrans), ToNativeTransposeMode(BTrans), ToSizeT(m), ToSizeT(n), ToSizeT(k), - alpha, + To(alpha), reinterpret_cast(A), ToSizeT(lda), reinterpret_cast(x), ToSizeT(LDB), - beta, + To(beta), reinterpret_cast(y), ToSizeT(LDC)); } diff --git a/include/hydrogen/device/gpu/CUB.hpp b/include/hydrogen/device/gpu/CUB.hpp index 75f10ad167..a328948ec5 100644 --- a/include/hydrogen/device/gpu/CUB.hpp +++ b/include/hydrogen/device/gpu/CUB.hpp @@ -2,23 +2,12 @@ #define HYDROGEN_IMPORTS_CUB_HPP_ #include "El/hydrogen_config.h" - -#ifdef HYDROGEN_HAVE_CUDA -#include -#include -#elif defined HYDROGEN_HAVE_ROCM -#include -#endif // HYDROGEN_HAVE_CUB +#include "hydrogen/PoolAllocator.hpp" namespace hydrogen { namespace cub { -#ifdef HYDROGEN_HAVE_CUDA -namespace cub_impl = ::cub; -#elif defined HYDROGEN_HAVE_ROCM -namespace cub_impl = ::hipcub; -#endif // HYDROGEN_HAVE_CUDA /** @brief Get singleton instance of CUB memory pool. * @@ -26,19 +15,30 @@ namespace cub_impl = ::hipcub; * already. The following environment variables are used to * control the construction of the memory pool: * - * - H_CUB_BIN_GROWTH: The growth factor. (Default: 2) - * - H_CUB_MIN_BIN: The minimum bin. (Default: 1) - * - H_CUB_MAX_BIN: The maximum bin. (Default: no max bin) - * - H_CUB_MAX_CACHED_SIZE: The maximum aggregate cached bytes + * - H_MEMPOOL_BIN_GROWTH: The growth factor. (Default: 2) + * - H_MEMPOOL_MIN_BIN: The minimum bin. (Default: 1) + * - H_MEMPOOL_MAX_BIN: The maximum bin. (Default: no max bin) + * - H_MEMPOOL_MAX_CACHED_SIZE: The maximum aggregate cached bytes * per device. (Default: No maximum) - * - H_CUB_DEBUG: If nonzero, allow CUB to print debugging output. + * - H_MEMPOOL_MAX_BIN_ALLOC_SIZE: The maximum cached bytes per + * allocation. (Default: No maximum) + * - H_MEMPOOL_BIN_MULT_THRESHOLD: The difference between two consecutive + * geometric bins from which linear binning is used. (Default: No + * linear binning) + * - H_MEMPOOL_BIN_MULT: The multiplier for linear bin sizes. (Default: + * No linear binning) + * - H_MEMPOOL_BIN_SIZES: Custom set of (comma-separated) bin sizes in + * bytes. (Default: None) + * - H_MEMPOOL_MALLOCASYNC: If nonzero, uses mallocAsync as the backend + * for non-binned allocation. (Default: 0) + * - H_MEMPOOL_DEBUG: If nonzero, prints debugging output. * * Note that if debugging output is turned on, there is no * synchronization across processes. Users should take care to * redirect output on a per-rank basis, either through the * features exposed by their MPI launcher or by some other means. */ - cub_impl::CachingDeviceAllocator& MemoryPool(); + PooledDeviceAllocator& MemoryPool(); /** Destroy singleton instance of CUB memory pool. */ void DestroyMemoryPool(); diff --git a/include/hydrogen/device/gpu/rocm/rocBLASError.hpp b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp index 7e1328ff51..5899b16416 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASError.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include // Helper error-checking macro. 
#define H_CHECK_ROCBLAS(cmd) \ diff --git a/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp index 6141694dab..27b3bb90f6 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp index e4838dd8ef..3316834b48 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include namespace hydrogen { @@ -41,14 +41,9 @@ struct NativeTypeT> { using type = rocblas_double_complex; // Half precision requires conversion as well #ifdef HYDROGEN_GPU_USE_FP16 -template <> struct NativeTypeT { using type = rocblas_half; }; +template <> struct NativeTypeT { using type = rocblas_half; }; #ifdef HYDROGEN_HAVE_HALF template <> struct NativeTypeT { using type = rocblas_half; }; -template <> -struct NativeTypeT> -{ - using type = rocblas_half_complex; -}; #endif // HYDROGEN_HAVE_HALF #endif // HYDROGEN_GPU_USE_FP16 @@ -118,10 +113,18 @@ struct IsSupportedType_Base template <> struct IsSupportedType_Base : std::true_type {}; template <> +struct IsSupportedType_Base : std::true_type {}; +template <> struct IsSupportedType_Base : std::true_type {}; template <> struct IsSupportedType_Base : std::true_type {}; +// support via rocblas_nrm2_ex +template <> +struct IsSupportedType_Base : std::true_type {}; +// support via rocblas_scal_ex +template <> +struct IsSupportedType_Base : std::true_type {}; #endif // HYDROGEN_GPU_USE_FP16 /** @class IsSupportedType diff --git a/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp index 319b287635..fb09d48e68 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp index 8dd788050a..ec17f0f631 100644 --- a/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp +++ b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { @@ -51,9 +51,15 @@ ADD_AXPY_DECL(double); ADD_COPY_DECL(float); ADD_COPY_DECL(double); +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_DOT_DECL(rocblas_half); +#endif // HYDROGEN_GPU_USE_FP16 ADD_DOT_DECL(float); ADD_DOT_DECL(double); +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_NRM2_DECL(rocblas_half); +#endif // HYDROGEN_GPU_USE_FP16 ADD_NRM2_DECL(float); ADD_NRM2_DECL(double); diff --git a/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp index d4a37b29ff..4fdbec7e5a 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVERManagement.hpp @@ -4,7 +4,7 @@ #include #include "rocBLASManagement.hpp" -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp index cc5c648523..3c1b5cea24 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVERMeta.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include "rocBLASMeta.hpp" diff --git 
a/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp index 7e088fdcf8..3f47f00103 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVERUtil.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { diff --git a/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp b/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp index 8315d8b9f1..d0b8199216 100644 --- a/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp +++ b/include/hydrogen/device/gpu/rocm/rocSOLVER_API.hpp @@ -3,7 +3,7 @@ #include -#include +#include namespace hydrogen { namespace rocsolver { diff --git a/include/hydrogen/utils/HalfPrecision.hpp b/include/hydrogen/utils/HalfPrecision.hpp index 72c9c7918f..123b1287ea 100644 --- a/include/hydrogen/utils/HalfPrecision.hpp +++ b/include/hydrogen/utils/HalfPrecision.hpp @@ -46,7 +46,11 @@ struct is_arithmetic : true_type {}; #endif // Now include the actual Half library. +#if __has_include() // E.g., the one that ships with ROCm +#include +#else #include +#endif // Declare the hydrogen typedef namespace hydrogen @@ -94,19 +98,24 @@ inline hydrogen::cpu_half_type operator^(hydrogen::cpu_half_type const&, #endif // HYDROGEN_HAVE_HALF // Finally, do the GPU stuff -#ifdef HYDROGEN_GPU_USE_FP16 +#if defined HYDROGEN_HAVE_GPU && defined HYDROGEN_GPU_USE_FP16 // Grab the right header #if defined(HYDROGEN_HAVE_CUDA) #include -#elif defined(HYDROGEN_HAVE_AMDGPU) -#include +// Needed for CUDA_VERSION +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include #endif // HYDROGEN_HAVE_CUDA namespace hydrogen { -#if defined(HYDROGEN_HAVE_CUDA) +#if defined(__HIP_NO_HALF_CONVERSIONS__) +static_assert(false, "__HIP_NO_HALF_CONVERSIONS__ is defined."); +#endif + /** @brief Unified name for the FP16 type on GPU */ using gpu_half_type = __half; @@ -118,14 +127,9 @@ struct TypeTraits static std::string Name() { return typeid(gpu_half_type).name(); } };// struct TypeTraits -#elif defined(HYDROGEN_HAVE_AMDGPU) -/** @brief Unified name for the FP16 type on GPU */ -using gpu_half_type = rocblas_half; -#endif // HYDROGEN_HAVE_CUDA - }// namespace hydrogen -#if defined(HYDROGEN_HAVE_CUDA) && !defined(__CUDACC__) +#if defined(HYDROGEN_HAVE_ROCM) || (defined(HYDROGEN_HAVE_CUDA) && !(defined(__CUDACC__)) && (CUDA_VERSION < 12020)) /** @brief Enable "update" functionality for __half. 
*/ template @@ -198,12 +202,49 @@ inline hydrogen::gpu_half_type operator-( return -float(val); } -#endif // defined(HYDROGEN_HAVE_CUDA) && !defined(__CUDACC__) +#if defined(HYDROGEN_HAVE_ROCM) +inline bool operator<( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) < float(y); +} + +inline bool operator>( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) > float(y); +} + +inline bool operator<=( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) <= float(y); +} + +inline bool operator>=( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) >= float(y); +} + +inline bool operator==( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return float(x) == float(y); +} + +inline bool operator!=( + hydrogen::gpu_half_type const& x, hydrogen::gpu_half_type const& y) +{ + return !(x == y); +} +#endif // defined(HYDROGEN_HAVE_ROCM) +#endif // defined(HYDROGEN_HAVE_ROCM) || (defined(HYDROGEN_HAVE_CUDA) && !(defined(__CUDACC__)) && (CUDA_VERSION < 12020)) inline std::ostream& operator<<(std::ostream& os, hydrogen::gpu_half_type const& x) { return os << float(x) << "_h"; } -#endif // HYDROGEN_GPU_USE_FP16 +#endif // defined HYDROGEN_HAVE_GPU && defined HYDROGEN_GPU_USE_FP16 #endif // HYDROGEN_UTILS_HALFPRECISION_HPP_ diff --git a/include/hydrogen/utils/NumericTypeConversion.hpp b/include/hydrogen/utils/NumericTypeConversion.hpp index b30f4c3e32..aafddc0e8e 100644 --- a/include/hydrogen/utils/NumericTypeConversion.hpp +++ b/include/hydrogen/utils/NumericTypeConversion.hpp @@ -15,7 +15,11 @@ */ #ifdef HYDROGEN_HAVE_HALF +#if __has_include() // E.g., the one that ships with ROCm +#include +#else #include +#endif #endif // HYDROGEN_HAVE_HALF namespace hydrogen @@ -59,6 +63,17 @@ struct Caster return static_cast<__half>(x); } }; + +#ifdef HYDROGEN_HAVE_ROCM +template <> +struct Caster<__half, double> +{ + static double Cast(__half const& x) + { + return float(x); + } +}; +#endif // HYDROGEN_HAVE_ROCM #endif // HYDROGEN_GPU_USE_FP16 template diff --git a/src/core/MemoryPool.cpp b/src/core/MemoryPool.cpp index 1b5058f651..3e81fd70e4 100644 --- a/src/core/MemoryPool.cpp +++ b/src/core/MemoryPool.cpp @@ -4,6 +4,29 @@ namespace El { +bool details::debug_mempool() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_DEBUG"); + return (env && std::strlen(env) && env[0] != '0'); +} + +float details::default_mempool_bin_growth() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_BIN_GROWTH"); + return (env ? std::stof(env) : 1.6); +} + +size_t details::default_mempool_min_bin() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_MIN_BIN"); + return (env ? std::stoull(env) : 1UL); +} + +size_t details::default_mempool_max_bin() noexcept +{ + char const* const env = std::getenv("H_MEMPOOL_MAX_BIN"); + return (env ? 
std::stoull(env) : (1 << 26)); +} namespace { diff --git a/src/core/Profiling.cpp b/src/core/Profiling.cpp index 72d40349b1..27fa5323b5 100644 --- a/src/core/Profiling.cpp +++ b/src/core/Profiling.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "El/hydrogen_config.h" #include "El/core/Profiling.hpp" @@ -11,6 +12,10 @@ #include "cuda_runtime.h" #endif // HYDROGEN_HAVE_NVPROF +#ifdef HYDROGEN_HAVE_ROCTRACER +#include "roctx.h" +#endif + #ifdef HYDROGEN_HAVE_VTUNE #include #endif // HYDROGEN_HAVE_VTUNE @@ -56,6 +61,13 @@ bool nvprof_runtime_enabled = true; bool NVProfRuntimeEnabled() noexcept { return nvprof_runtime_enabled; } #endif +// Some variables for roctx +#ifdef HYDROGEN_HAVE_ROCTRACER +bool roctx_runtime_enabled = true; + +bool roctxRuntimeEnabled() noexcept { return roctx_runtime_enabled; } +#endif + }// namespace void EnableVTune() noexcept @@ -72,6 +84,13 @@ void EnableNVProf() noexcept #endif // HYDROGEN_HAVE_NVPROF } +void EnableROCTX() noexcept +{ +#ifdef HYDROGEN_HAVE_ROCTRACER + roctx_runtime_enabled = true; +#endif // HYDROGEN_HAVE_ROCTRACER +} + void DisableVTune() noexcept { #ifdef HYDROGEN_HAVE_VTUNE @@ -86,6 +105,13 @@ void DisableNVProf() noexcept #endif // HYDROGEN_HAVE_NVPROF } +void DisableROCTX() noexcept +{ +#ifdef HYDROGEN_HAVE_ROCTRACER + roctx_runtime_enabled = false; +#endif // HYDROGEN_HAVE_ROCTRACER +} + Color GetNextProfilingColor() noexcept { auto id = current_color.fetch_add(1, std::memory_order_relaxed); @@ -94,6 +120,11 @@ Color GetNextProfilingColor() noexcept void BeginRegionProfile(char const* s, Color c) noexcept { +#ifdef HYDROGEN_HAVE_ROCTRACER + if (roctxRuntimeEnabled()) + roctxRangePush(s); +#endif // HYDROGEN_HAVE_ROCTRACER + #ifdef HYDROGEN_HAVE_NVPROF if (NVProfRuntimeEnabled()) { @@ -130,6 +161,11 @@ void BeginRegionProfile(char const* s, Color c) noexcept void EndRegionProfile(const char *) noexcept { +#ifdef HYDROGEN_HAVE_ROCTRACER + if (roctxRuntimeEnabled()) + roctxRangePop(); +#endif // HYDROGEN_HAVE_ROCTRACER + #ifdef HYDROGEN_HAVE_NVPROF if (NVProfRuntimeEnabled()) nvtxRangePop(); diff --git a/src/core/environment.cpp b/src/core/environment.cpp index fc0db9f499..da06003e70 100644 --- a/src/core/environment.cpp +++ b/src/core/environment.cpp @@ -341,6 +341,11 @@ void Finalize() if( mpi::Finalized() ) cerr << "Warning: MPI was finalized before Elemental." << endl; + + // Reset the global comms + const_cast(El::mpi::COMM_SELF).Reset(); + const_cast(El::mpi::COMM_WORLD).Reset(); + if( ::numElemInits == 0 ) { delete ::args; diff --git a/src/core/imports/cub.cpp b/src/core/imports/cub.cpp index 61fb2ca6bc..60164ff932 100644 --- a/src/core/imports/cub.cpp +++ b/src/core/imports/cub.cpp @@ -1,75 +1,93 @@ #include "hydrogen/device/gpu/CUB.hpp" #include +#include +#include -namespace hydrogen -{ -namespace cub -{ -namespace -{ - -unsigned int get_env_uint(char const* env_var_name, - unsigned int default_value = 0U) noexcept -{ - char const* env = std::getenv(env_var_name); - return (env - ? static_cast(std::stoi(env)) - : default_value); +namespace hydrogen { +namespace cub { +namespace { + +unsigned int get_env_uint(char const *env_var_name, + unsigned int default_value = 0U) noexcept { + char const *env = std::getenv(env_var_name); + return (env ? 
static_cast(std::stoi(env)) : default_value); +} + +unsigned int get_bin_growth() noexcept { + return get_env_uint("H_MEMPOOL_BIN_GROWTH", 2U); +} + +unsigned int get_min_bin() noexcept { + return get_env_uint("H_MEMPOOL_MIN_BIN", 1U); +} + +unsigned int get_max_bin() noexcept { + return get_env_uint("H_MEMPOOL_MAX_BIN", PooledDeviceAllocator::INVALID_BIN); } -unsigned int get_bin_growth() noexcept -{ - return get_env_uint("H_CUB_BIN_GROWTH", 2U); +size_t get_max_cached_size() noexcept { + char const *env = std::getenv("H_MEMPOOL_MAX_CACHED_SIZE"); + return (env ? static_cast(std::stoul(env)) + : PooledDeviceAllocator::INVALID_SIZE); } -unsigned int get_min_bin() noexcept -{ - return get_env_uint("H_CUB_MIN_BIN", 1U); +size_t get_max_bin_alloc_size() noexcept { + char const *env = std::getenv("H_MEMPOOL_MAX_BIN_ALLOC_SIZE"); + return (env ? static_cast(std::stoul(env)) + : PooledDeviceAllocator::INVALID_SIZE); } -unsigned int get_max_bin() noexcept -{ - return get_env_uint("H_CUB_MAX_BIN", - cub_impl::CachingDeviceAllocator::INVALID_BIN); +unsigned int get_bin_mult_threshold() noexcept { + return get_env_uint("H_MEMPOOL_BIN_MULT_THRESHOLD", + PooledDeviceAllocator::INVALID_BIN); } -size_t get_max_cached_size() noexcept -{ - char const* env = std::getenv("H_CUB_MAX_CACHED_SIZE"); - return (env - ? static_cast(std::stoul(env)) - : cub_impl::CachingDeviceAllocator::INVALID_SIZE); +unsigned int get_bin_mult() noexcept { + return get_env_uint("H_MEMPOOL_BIN_MULT", PooledDeviceAllocator::INVALID_BIN); } -bool get_debug() noexcept -{ - char const* env = std::getenv("H_CUB_DEBUG"); - return (env - ? static_cast(std::stoi(env)) - : false); +bool get_debug() noexcept { + char const *env = std::getenv("H_MEMPOOL_DEBUG"); + return (env ? static_cast(std::stoi(env)) : false); +} + +bool get_malloc_async() noexcept { + char const *env = std::getenv("H_MEMPOOL_MALLOCASYNC"); + return (env ? static_cast(std::stoi(env)) : false); +} + +std::set get_bin_sizes() noexcept { + std::set result; + char const *env = std::getenv("H_MEMPOOL_BIN_SIZES"); + if (!env) + return result; + + std::string envstr{env}; + std::istringstream iss{envstr}; + std::string elem; + while(std::getline(iss, elem, ',')) { + result.insert(std::stoull(elem)); + } + return result; } /** Singleton instance of CUB memory pool. */ -std::unique_ptr memoryPool_; -} // namespace - -cub_impl::CachingDeviceAllocator& MemoryPool() -{ - if (!memoryPool_) - memoryPool_.reset( - new cub_impl::CachingDeviceAllocator( - get_bin_growth(), - get_min_bin(), - get_max_bin(), - get_max_cached_size(), - /*skip_cleanup=*/false, - get_debug())); - return *memoryPool_; +std::unique_ptr memoryPool_; +} // namespace + +PooledDeviceAllocator &MemoryPool() { + if (!memoryPool_) + memoryPool_.reset(new PooledDeviceAllocator( + get_bin_growth(), get_min_bin(), get_max_bin(), get_max_cached_size(), + /*skip_cleanup=*/false, get_debug(), get_bin_mult_threshold(), + get_bin_mult(), get_max_bin_alloc_size(), get_bin_sizes(), + get_malloc_async())); + + return *memoryPool_; } -void DestroyMemoryPool() -{ memoryPool_.reset(); } +void DestroyMemoryPool() { memoryPool_.reset(); } -} // namespace CUBMemoryPool +} // namespace cub } // namespace hydrogen diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 9d724f516e..a05385d758 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -595,15 +595,24 @@ EL_NO_RELEASE_EXCEPT { EL_DEBUG_CSE; + if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. 
+ using BE = BestBackend; + Al::Send(buf, count, to, comm.template GetComm(syncInfo)); + } + else + { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); + ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); + Synchronize(syncInfo); - EL_CHECK_MPI_CALL( - MPI_Send( - buf, count, TypeMap(), to, tag, comm.GetMPIComm())); + EL_CHECK_MPI_CALL( + MPI_Send( + buf, count, TypeMap(), to, tag, comm.GetMPIComm())); + } } template ; + if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Send(buf, count, to, comm.template GetComm(syncInfo)); + } + else if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Send((Real*)buf, 2*count, to, + comm.template GetComm(syncInfo)); + } + else { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); + ENSURE_HOST_SEND_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); + Synchronize(syncInfo); #ifdef EL_AVOID_COMPLEX_MPI - EL_CHECK_MPI_CALL( - MPI_Send( - buf, 2*count, TypeMap(), to, tag, comm.GetMPIComm())); + EL_CHECK_MPI_CALL( + MPI_Send( + buf, 2*count, TypeMap(), to, tag, comm.GetMPIComm())); #else - EL_CHECK_MPI_CALL( - MPI_Send( - buf, count, TypeMap>(), to, tag, comm.GetMPIComm())); + EL_CHECK_MPI_CALL( + MPI_Send( + buf, count, TypeMap>(), to, tag, comm.GetMPIComm())); #endif + } } template ::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Recv(buf, count, from, comm.template GetComm(syncInfo)); + } + else + { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); + ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); - Status status; - EL_CHECK_MPI_CALL( - MPI_Recv( - buf, count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); + Synchronize(syncInfo); + Status status; + EL_CHECK_MPI_CALL( + MPI_Recv( + buf, count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); + } } template * buf, int count, int from, int tag, Comm const& c EL_NO_RELEASE_EXCEPT { EL_DEBUG_CSE; + using CT = std::complex; + if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. + using BE = BestBackend; + Al::Recv(buf, count, from, comm.template GetComm(syncInfo)); + } + else if constexpr (IsAluminumSupported::value) + { + (void) tag; // Al doesn't use tags. 
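The complex-valued overloads fall back to shipping the data as 2*count reals when only the real-typed Aluminum backend is usable, as in the Send above and the Recv that follows. That reinterpretation is safe because std::complex<T> is guaranteed to be layout-compatible with T[2]; a standalone check, illustrative only:

#include <cassert>
#include <complex>
#include <vector>

int main()
{
    // A buffer of n std::complex<double> values may be viewed as 2*n
    // doubles (real, imag pairs), which is what the (Real*) casts with
    // doubled counts rely on.
    std::vector<std::complex<double>> z{{1.0, 2.0}, {3.0, 4.0}};
    auto const* flat = reinterpret_cast<double const*>(z.data());
    assert(flat[0] == 1.0 && flat[1] == 2.0);
    assert(flat[2] == 3.0 && flat[3] == 4.0);
    return 0;
}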
+ using BE = BestBackend; + Al::Recv((Real*) buf, 2*count, from, + comm.template GetComm(syncInfo)); + } + else + { #ifdef HYDROGEN_ENSURE_HOST_MPI_BUFFERS - ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); + ENSURE_HOST_RECV_BUFFER(buf, count, syncInfo); #endif // HYDROGEN_ENSURE_HOST_MPI_BUFFERS - Synchronize(syncInfo); + Synchronize(syncInfo); - Status status; + Status status; #ifdef EL_AVOID_COMPLEX_MPI - EL_CHECK_MPI_CALL( - MPI_Recv( - buf, 2*count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); + EL_CHECK_MPI_CALL( + MPI_Recv( + buf, 2*count, TypeMap(), from, tag, comm.GetMPIComm(), &status)); #else - EL_CHECK_MPI_CALL( - MPI_Recv( - buf, count, TypeMap>(), - from, tag, comm.GetMPIComm(), &status)); + EL_CHECK_MPI_CALL( + MPI_Recv( + buf, count, TypeMap>(), + from, tag, comm.GetMPIComm(), &status)); #endif + } } template (syncInfo)); } +namespace { + +template struct BackendTag {}; + +template +void do_copy_n(T const* src, int count, T* dst, SyncInfo const&) +{ + std::copy_n(src, count, dst); +} + +#ifdef HYDROGEN_HAVE_GPU +template +void do_copy_n(T const* src, int count, T* dst, SyncInfo const& si) +{ + gpu::Copy1DIntraDevice(src, dst, count, si); +} +#endif // HYDROGEN_HAVE_GPU + +template +void SafeInPlaceSendRecv(T* buf, int count, int to, int from, Comm const& comm, + SyncInfo const& syncInfo, BackendTag) +{ + hydrogen::simple_buffer tmp_recv_buf(count, syncInfo); + Al::SendRecv( + buf, count, to, tmp_recv_buf.data(), count, from, + comm.template GetComm(syncInfo)); + do_copy_n(tmp_recv_buf.data(), count, buf, syncInfo); +} + +#ifdef HYDROGEN_HAVE_AL_HOST_XFER +template +void SafeInPlaceSendRecv(T* buf, int count, int to, int from, Comm const& comm, + SyncInfo const& syncInfo, + BackendTag) +{ + // This is ok due to implementation details of how the + // HostTransfer backend works. 
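The generic SafeInPlaceSendRecv above emulates an in-place exchange for backends that require distinct send and receive buffers: it receives into a temporary simple_buffer and then copies the result back over the caller's pointer. The same idea, distilled to plain host-side MPI with a hypothetical helper name, purely as an analogy rather than the code path Hydrogen uses:

#include <algorithm>
#include <vector>
#include <mpi.h>

// Emulate an in-place exchange when only an out-of-place send/recv is
// available: receive into a temporary and copy back afterwards.
void InPlaceSendRecvFallback(double* buf, int count, int to, int from,
                             MPI_Comm comm)
{
    std::vector<double> tmp(count);
    MPI_Sendrecv(buf, count, MPI_DOUBLE, to, /*sendtag=*/0,
                 tmp.data(), count, MPI_DOUBLE, from, /*recvtag=*/0,
                 comm, MPI_STATUS_IGNORE);
    std::copy_n(tmp.begin(), count, buf);
}

Per the in-code comment, the HostTransfer overload can skip this staging and pass the same pointer for both sides.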
+ using Backend = Al::HostTransferBackend; + Al::SendRecv( + buf, count, to, buf, count, from, + comm.template GetComm(syncInfo)); +} +#endif +} // namespace + template >*/> void SendRecv(T* buf, int count, int to, int from, Comm const& comm, SyncInfo const& syncInfo) { EL_DEBUG_CSE; - using Backend = BestBackend; - // Not sure if Al is ok with this bit + +#ifdef HYDROGEN_AL_SUPPORTS_INPLACE_SENDRECV Al::SendRecv( - buf, count, to, buf, count, from, + buf, count, to, from, comm.template GetComm(syncInfo)); +#else + SafeInPlaceSendRecv(buf, count, to, from, comm, syncInfo, + BackendTag{}); +#endif } #endif // HYDROGEN_HAVE_ALUMINUM diff --git a/src/hydrogen/device/GPU.cpp b/src/hydrogen/device/GPU.cpp index 300e39eada..93b0ac25ae 100644 --- a/src/hydrogen/device/GPU.cpp +++ b/src/hydrogen/device/GPU.cpp @@ -40,6 +40,7 @@ int ComputeDeviceId(unsigned int device_count) noexcept // TODO: Use HWLOC or something to pick "closest GPU" int local_rank = 0; char* env = nullptr; + if (!env) { env = std::getenv("FLUX_TASK_LOCAL_ID"); } if (!env) { env = std::getenv("SLURM_LOCALID"); } if (!env) { env = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); } if (!env) { env = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); } diff --git a/src/hydrogen/device/ROCm.cpp b/src/hydrogen/device/ROCm.cpp index db1592184c..19f37d270a 100644 --- a/src/hydrogen/device/ROCm.cpp +++ b/src/hydrogen/device/ROCm.cpp @@ -75,7 +75,13 @@ hipStream_t GetNewStream() hipEvent_t GetNewEvent() { hipEvent_t event; +#if HIP_VERSION < 50600000 H_CHECK_HIP(hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#else + H_CHECK_HIP(hipEventCreateWithFlags( + &event, + hipEventDisableTiming | hipEventDisableSystemFence)); +#endif return event; } diff --git a/src/hydrogen/device/cuBLAS_API.cpp b/src/hydrogen/device/cuBLAS_API.cpp index fd6714bad5..59377159f8 100644 --- a/src/hydrogen/device/cuBLAS_API.cpp +++ b/src/hydrogen/device/cuBLAS_API.cpp @@ -167,7 +167,6 @@ using RealType = typename RealTypeT::type; #define ADD_NRM2_IMPL(ScalarType, TypeChar) \ void Nrm2(cublasHandle_t handle, \ int n, ScalarType const* X, int incx, \ - ScalarType const* Y, int incy, \ RealType* output) \ { \ H_CHECK_CUBLAS( \ diff --git a/src/hydrogen/device/rocBLAS_API.cpp b/src/hydrogen/device/rocBLAS_API.cpp index 2c8fa4f783..75c5ddcdb6 100644 --- a/src/hydrogen/device/rocBLAS_API.cpp +++ b/src/hydrogen/device/rocBLAS_API.cpp @@ -1,14 +1,112 @@ #include +#include #include #include -#include +#include namespace hydrogen { namespace rocblas { +namespace +{ +/** @brief Manage host-pointer result semantics. + * + * If the memory mode on the handle is already set to DEVICE, this is + * just a passthrough. Otherwise, we grab memory from CUB (fallback + * to the HIP stream-aware allocator) and use that. 
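From the caller's perspective the staging described above is invisible: a handle left in the default host pointer mode still receives its scalar result synchronously, matching the cuBLAS convention. A sketch of such a call site follows, with error checking elided; the header paths are assumed from this repository's layout and the expected value is just arithmetic for the chosen input.

#include <cstdio>
#include <vector>
#include <hip/hip_runtime.h>
#include <rocblas/rocblas.h>
#include <hydrogen/device/gpu/rocm/rocBLAS_API.hpp>

int main()
{
    std::vector<float> h_x(1024, 2.0f);
    float* d_x = nullptr;
    hipMalloc(reinterpret_cast<void**>(&d_x), h_x.size() * sizeof(float));
    hipMemcpy(d_x, h_x.data(), h_x.size() * sizeof(float), hipMemcpyHostToDevice);

    rocblas_handle handle;
    rocblas_create_handle(&handle);   // pointer mode defaults to host

    // The wrapper stages the result through a device scalar, copies it back,
    // and synchronizes the stream, so nrm is valid as soon as this returns.
    float nrm = 0.f;
    hydrogen::rocblas::Nrm2(handle, static_cast<rocblas_int>(h_x.size()),
                            d_x, 1, &nrm);
    std::printf("||x|| = %f\n", nrm);   // expect 64 for 1024 copies of 2

    rocblas_destroy_handle(handle);
    hipFree(d_x);
    return 0;
}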
+ */ +template +class ResultMgr +{ +public: + ResultMgr(rocblas_handle handle, T* result_ptr) + : handle_{handle}, + result_{result_ptr}, + device_{nullptr} + { + rocblas_pointer_mode current_ptr_mode; + H_CHECK_ROCBLAS(rocblas_get_pointer_mode(handle_, ¤t_ptr_mode)); + if (current_ptr_mode == rocblas_pointer_mode_host) + { + hipStream_t stream; + H_CHECK_ROCBLAS(rocblas_get_stream(handle_, &stream)); +#ifdef HYDROGEN_HAVE_CUB + H_CHECK_HIP(cub::MemoryPool().DeviceAllocate( + reinterpret_cast(&device_), + sizeof(T), + stream)); +#else + H_CHECK_HIP(hipMallocAsync( + reinterpret_cast(&device_), + sizeof(T), + stream)); +#endif // HYDROGEN_HAVE_CUB + + // Now set the pointer mode + H_CHECK_ROCBLAS( + rocblas_set_pointer_mode(handle_, + rocblas_pointer_mode_device)); + } + } + + ~ResultMgr() + { + try + { + if (device_) + { + hipStream_t stream; + H_CHECK_ROCBLAS(rocblas_get_stream(handle_, &stream)); + + // Copy asyncly to host + H_CHECK_HIP(hipMemcpyAsync(result_, + device_, + sizeof(T), + hipMemcpyDeviceToHost, + stream)); + + // Clean up device memory +#ifdef HYDROGEN_HAVE_CUB + H_CHECK_HIP(cub::MemoryPool().DeviceFree(device_)); +#else + H_CHECK_HIP(hipFreeAsync(device_, stream)); +#endif // HYDROGEN_HAVE_CUB + + // Sync stream to match cuBLAS behavior (cuBLAS docs here: + // https://docs.nvidia.com/cuda/cublas/#scalar-parameters) + H_CHECK_HIP(hipStreamSynchronize(stream)); + + // Reset pointer mode + H_CHECK_ROCBLAS(rocblas_set_pointer_mode(handle_, rocblas_pointer_mode_host)); + } + } + catch (std::exception const& e) + { + std::cerr << "Caught exception in dtor:\n\n " << e.what() << "\n\nTerminating." + << std::endl; + std::terminate(); + } + } + + T* get() noexcept { return (device_ ? device_ : result_); } +private: + rocblas_handle handle_ = nullptr; + T* result_ = nullptr; + T* device_ = nullptr; +}; // struct ResultMgr + +template +auto manage_result(rocblas_handle handle, T* result_ptr) +{ + if (!result_ptr) + throw GPUError("result_ptr cannot be null"); + return ResultMgr{handle, result_ptr}; +}; + +} // // BLAS 1 @@ -42,23 +140,25 @@ namespace rocblas rocblas_int n, \ ScalarType const* X, rocblas_int incx, \ ScalarType const* Y, rocblas_int incy, \ - ScalarType* result) \ + ScalarType* result_in) \ { \ + auto result = manage_result(handle, result_in); \ H_CHECK_ROCBLAS( \ rocblas_ ## TypeChar ## dot( \ handle, \ - n, X, incx, Y, incy, result)); \ + n, X, incx, Y, incy, result.get())); \ } #define ADD_NRM2_IMPL(ScalarType, TypeChar) \ void Nrm2(rocblas_handle handle, \ rocblas_int n, ScalarType const* X, rocblas_int incx, \ - ScalarType* result) \ + ScalarType* result_in) \ { \ + auto result = manage_result(handle, result_in); \ H_CHECK_ROCBLAS( \ rocblas_ ## TypeChar ## nrm2( \ handle, \ - n, X, incx, result)); \ + n, X, incx, result.get())); \ } #define ADD_SCALE_IMPL(ScalarType, TypeChar) \ @@ -243,14 +343,49 @@ ADD_AXPY_IMPL(double, d) ADD_COPY_IMPL(float, s) ADD_COPY_IMPL(double, d) -//ADD_DOT_IMPL(rocblas_half, h) +ADD_DOT_IMPL(rocblas_half, h) ADD_DOT_IMPL(float, s) ADD_DOT_IMPL(double, d) -//ADD_NRM2_IMPL(rocblas_half, h) +// ADD_NRM2_IMPL(rocblas_half, h) +void Nrm2(rocblas_handle handle, + rocblas_int n, + rocblas_half const* X, + rocblas_int incx, + rocblas_half* result_in) +{ + auto result = manage_result(handle, result_in); + H_CHECK_ROCBLAS( + rocblas_nrm2_ex( + handle, + n, + X, + rocblas_datatype_f16_r, + incx, + result.get(), + rocblas_datatype_f16_r, + rocblas_datatype_f32_r)); +} + ADD_NRM2_IMPL(float, s) ADD_NRM2_IMPL(double, d) +void Scale(rocblas_handle 
handle, + rocblas_int n, rocblas_half const& alpha, + rocblas_half* X, rocblas_int incx) +{ + H_CHECK_ROCBLAS( + rocblas_scal_ex( + handle, + n, + &alpha, + rocblas_datatype_f16_r, + X, + rocblas_datatype_f16_r, + incx, + rocblas_datatype_f32_r)); +} + ADD_SCALE_IMPL(float, s) ADD_SCALE_IMPL(double, d) @@ -320,21 +455,21 @@ ASSERT_SUPPORT(double, BLAS_Op::NRM2); ASSERT_SUPPORT(double, BLAS_Op::GEMMSTRIDEDBATCHED); #ifdef HYDROGEN_GPU_USE_FP16 -ASSERT_SUPPORT(rocblas_half, BLAS_Op::AXPY); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMM); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::SCAL); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::COPY); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::DGMM); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEAM); -ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEMV); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::DOT); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::NRM2); -ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMMSTRIDEDBATCHED); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::AXPY); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::GEMM); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(gpu_half_type, BLAS_Op::GEMV); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::DOT); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::NRM2); +ASSERT_SUPPORT(gpu_half_type, BLAS_Op::GEMMSTRIDEDBATCHED); #ifdef HYDROGEN_HAVE_HALF ASSERT_SUPPORT(cpu_half_type, BLAS_Op::AXPY); ASSERT_SUPPORT(cpu_half_type, BLAS_Op::GEMM); -ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::SCAL); +ASSERT_SUPPORT(cpu_half_type, BLAS_Op::SCAL); ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::COPY); ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::DGMM); ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEAM); diff --git a/src/hydrogen/device/rocSOLVER_API.cpp b/src/hydrogen/device/rocSOLVER_API.cpp index ad57073294..229edadf8c 100644 --- a/src/hydrogen/device/rocSOLVER_API.cpp +++ b/src/hydrogen/device/rocSOLVER_API.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace hydrogen { namespace rocsolver { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 43dff24736..e9cc78c2ee 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,13 +11,20 @@ foreach (src_file ${SOURCES}) add_executable("${__test_name}" ${src_file}) target_link_libraries("${__test_name}" PRIVATE ${HYDROGEN_LIBRARIES}) + # FIXME: https://github.com/llnl/elemental/issues/159 + if (__test_name MATCHES "DifferentGridsGeneral") + continue() + endif () + # Create test add_test(NAME "${__test_name}.test" COMMAND "${__test_name}") - add_test(NAME "${__test_name}_mpi_np4.test" - COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPI_PREFLAGS} - $ ${MPI_POSTFLAGS}) + if (MPIEXEC) + add_test(NAME "${__test_name}_mpi_np4.test" + COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPI_PREFLAGS} + $ ${MPI_POSTFLAGS}) + endif () endforeach () set_target_properties(HermitianEig diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index 14dc5d4ca6..f1fc0a1bb6 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -7,6 +7,7 @@ if (HYDROGEN_HAVE_GPU) gpu_combine_test.cu gpu_entrywise_map_test.cu copy_cpu_gpu_test.cpp + pool_allocator_test.cpp ) if (HYDROGEN_HAVE_HALF AND HYDROGEN_GPU_USE_FP16) list(APPEND HYDROGEN_CATCH2_TEST_FILES diff --git a/unit_test/pool_allocator_test.cpp b/unit_test/pool_allocator_test.cpp new file mode 100644 index 0000000000..eb3c3521b4 --- /dev/null +++ b/unit_test/pool_allocator_test.cpp @@ -0,0 +1,75 @@ 
+#include + +#include "El.hpp" +#include "hydrogen/PoolAllocator.hpp" + +TEST_CASE("Testing hydrogen::PooledDeviceAllocator", "[memory][utils][gpu]") { + SECTION("Basic pool behavior and geometric binning") { + hydrogen::PooledDeviceAllocator alloc{/*bin_growth=*/2, + /*min_bin=*/1, /*max_bin=*/20}; + void *ptr; + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 201); + REQUIRE(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + CHECK(alloc.ExcessMemory() == 55 * sizeof(float)); + alloc.DeviceFree(ptr); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 203); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + alloc.DeviceFree(ptr); + + // Unbinned memory + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 1048576); + CHECK(alloc.TotalAllocatedMemory() == (256 + 1048576) * sizeof(float)); + alloc.DeviceFree(ptr); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + } + SECTION("Linear binning test") { + hydrogen::PooledDeviceAllocator alloc{ + /*bin_growth=*/2, + /*min_bin=*/1, + /*max_bin=*/20, + /*max_cached_size=*/hydrogen::PooledDeviceAllocator::INVALID_SIZE, + /*skip_cleanup=*/false, + /*debug=*/false, + /*bin_mult_threshold=*/1024 * sizeof(float), + /*bin_mult=*/6, + /*max_binned_alloc_size=*/4007 * sizeof(float)}; + + void *ptr; + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 201); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float)); + alloc.DeviceFree(ptr); + + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 3001); + CHECK(alloc.TotalAllocatedMemory() == 256 * sizeof(float) + 12006); + alloc.DeviceFree(ptr); + + alloc.DeviceAllocate(-1, &ptr, sizeof(float) * 4009); + CHECK(alloc.TotalAllocatedMemory() == + ((256 + 4009) * sizeof(float) + 12006)); + alloc.DeviceFree(ptr); + CHECK(alloc.TotalAllocatedMemory() == (256 * sizeof(float) + 12006)); + } + SECTION("Custom binning test") { + hydrogen::PooledDeviceAllocator alloc{ + /*bin_growth=*/2, + /*min_bin=*/1, + /*max_bin=*/20, + /*max_cached_size=*/hydrogen::PooledDeviceAllocator::INVALID_SIZE, + /*skip_cleanup=*/false, + /*debug=*/false, + /*bin_mult_threshold=*/hydrogen::PooledDeviceAllocator::INVALID_BIN, + /*bin_mult=*/hydrogen::PooledDeviceAllocator::INVALID_BIN, + /*max_cached_size=*/hydrogen::PooledDeviceAllocator::INVALID_SIZE, + /*bin_sizes=*/{1, 21, 39, 100}}; + + void *ptr; + alloc.DeviceAllocate(-1, &ptr, 32); + CHECK(alloc.TotalAllocatedMemory() == 39); + alloc.DeviceFree(ptr); + + alloc.DeviceAllocate(-1, &ptr, 201); + CHECK(alloc.TotalAllocatedMemory() == (39 + 256)); + alloc.DeviceFree(ptr); + } +}
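A note on the arithmetic in the first test section: with bin_growth = 2 and min_bin = 1, a request is rounded up to the next power of two, so 201 floats (804 bytes) is served from the 1024-byte bin (256 floats) and carries 55 floats of slack, which is exactly what the TotalAllocatedMemory() and ExcessMemory() checks assert. Below is a small standalone sketch of that rounding rule, assuming pure geometric binning with no linear or custom bins; the helper name is hypothetical, not the allocator's internals.

#include <cassert>
#include <cstddef>

// Smallest bin_growth^k (k >= min_bin) that can hold the request.
std::size_t NearestGeometricBin(std::size_t bytes, unsigned bin_growth = 2,
                                unsigned min_bin = 1)
{
    std::size_t bin = 1;
    for (unsigned k = 0; k < min_bin; ++k)
        bin *= bin_growth;
    while (bin < bytes)
        bin *= bin_growth;
    return bin;
}

int main()
{
    assert(NearestGeometricBin(201 * sizeof(float)) == 256 * sizeof(float));
    assert(NearestGeometricBin(203 * sizeof(float)) == 256 * sizeof(float));
    // 256 - 201 = 55 floats of excess, matching the ExcessMemory() check above.
    assert(256 * sizeof(float) - 201 * sizeof(float) == 55 * sizeof(float));
    return 0;
}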