kokkos
diff --git a/‎cmake/kc-test.cmake‎
Lines changed: 1 addition & 0 deletions b/‎cmake/kc-test.cmake‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎unit_tests/CMakeLists.txt‎
Lines changed: 6 additions & 6 deletions b/‎unit_tests/CMakeLists.txt‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎unit_tests/logging.hpp‎
Lines changed: 10 additions & 10 deletions b/‎unit_tests/logging.hpp‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎unit_tests/nccl/test_allgather.cpp‎
Lines changed: 2 additions & 2 deletions b/‎unit_tests/nccl/test_allgather.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎unit_tests/nccl/test_allreduce.cpp‎
Lines changed: 6 additions & 6 deletions b/‎unit_tests/nccl/test_allreduce.cpp‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎unit_tests/nccl/test_alltoall.cpp‎
Lines changed: 4 additions & 4 deletions b/‎unit_tests/nccl/test_alltoall.cpp‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎unit_tests/nccl/test_broadcast.cpp‎
Lines changed: 2 additions & 2 deletions b/‎unit_tests/nccl/test_broadcast.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎unit_tests/nccl/test_point_to_point.cpp‎
Lines changed: 2 additions & 2 deletions b/‎unit_tests/nccl/test_point_to_point.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎unit_tests/nccl/test_reduce.cpp‎
Lines changed: 1 addition & 1 deletion b/‎unit_tests/nccl/test_reduce.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎unit_tests/nccl/utils.cpp‎
Lines changed: 107 additions & 0 deletions b/‎unit_tests/nccl/utils.cpp‎
Lines changed: 107 additions & 0 deletions
@@ -15,6 +15,7 @@ function(kc_add_unit_test name)
   if(UT_CORE)
     target_link_libraries(${name} PRIVATE KokkosComm::KokkosComm MPI::MPI_CXX)
     if(KokkosComm_ENABLE_NCCL)
+      target_sources(${name} PRIVATE nccl/utils.cpp)
       target_link_libraries(${name} PRIVATE NCCL::NCCL)
     endif()
   elseif(UT_MPI)
 
@@ -179,42 +179,42 @@ if(KokkosComm_ENABLE_NCCL)
     test.nccl.p2p
     NCCL
     NUM_PES 2
-    FILES test_main.cpp nccl/test_point_to_point.cpp
+    FILES test_main.cpp nccl/test_point_to_point.cpp nccl/utils.cpp
     LIBRARIES KokkosComm::KokkosComm MPI::MPI_CXX
   )
   kc_add_unit_test(
     test.nccl.broadcast
     NCCL
     NUM_PES 2
-    FILES test_main.cpp nccl/test_broadcast.cpp
+    FILES test_main.cpp nccl/test_broadcast.cpp nccl/utils.cpp
     LIBRARIES KokkosComm::KokkosComm MPI::MPI_CXX
   )
   kc_add_unit_test(
     test.nccl.all-gather
     NCCL
     NUM_PES 2
-    FILES test_main.cpp nccl/test_allgather.cpp
+    FILES test_main.cpp nccl/test_allgather.cpp nccl/utils.cpp
     LIBRARIES KokkosComm::KokkosComm MPI::MPI_CXX
   )
   kc_add_unit_test(
     test.nccl.all-to-all
     NCCL
     NUM_PES 2
-    FILES test_main.cpp nccl/test_alltoall.cpp
+    FILES test_main.cpp nccl/test_alltoall.cpp nccl/utils.cpp
     LIBRARIES KokkosComm::KokkosComm MPI::MPI_CXX
   )
   kc_add_unit_test(
     test.nccl.all-reduce
     NCCL
     NUM_PES 2
-    FILES test_main.cpp nccl/test_allreduce.cpp
+    FILES test_main.cpp nccl/test_allreduce.cpp nccl/utils.cpp
     LIBRARIES KokkosComm::KokkosComm MPI::MPI_CXX
   )
   kc_add_unit_test(
     test.nccl.reduce
     NCCL
     NUM_PES 2
-    FILES test_main.cpp nccl/test_reduce.cpp
+    FILES test_main.cpp nccl/test_reduce.cpp nccl/utils.cpp
     LIBRARIES KokkosComm::KokkosComm MPI::MPI_CXX
   )
 endif()
@@ -7,12 +7,12 @@
 #include <cstdlib>
 #include <string_view>
 
-#include <cuda.h>
+#include <KokkosComm/config.hpp>
 #include <mpi.h>
 #if defined(KOKKOSCOMM_ENABLE_NCCL)
 #include <nccl.h>
+#include <cuda_runtime.h>
 #endif
-
 #include <fmt/core.h>
 
 namespace logging {
@@ -47,24 +47,24 @@ constexpr std::array level_txt{"FATAL"sv, "ERROR"sv, "WARNING"sv, "INFO"sv, "TRA
 
 #define KC_CHECK(expr, ...) ((expr) ? void(0) : KC_FATAL(__VA_ARGS__))
 
-#define KC_CUDA_CHECK(expr)                                                                                      \
-  ([&]() {                                                                                                       \
-    cudaError_t kc_res_ = (expr);                                                                                \
-    return kc_res_ == cudaSuccess ? void(0)                                                                      \
-                                  : KC_FATAL("CUDA check failed: `" #expr "`: {}", cudaGetErrorString(kc_res_)); \
-  }())
-
 #define KC_MPI_CHECK(expr)                                                                            \
   ([&]() {                                                                                            \
     int kc_res_ = (expr);                                                                             \
     return kc_res_ == MPI_SUCCESS ? void(0) : KC_FATAL("MPI check failed: `" #expr "`: {}", kc_res_); \
   }())
 
-#if defined(KOKKOSCOMM_ENABLE_NCCL)
+#ifdef KOKKOSCOMM_ENABLE_NCCL
 #define KC_NCCL_CHECK(expr)                                                                                      \
   ([&]() {                                                                                                       \
     ncclResult_t kc_res_ = (expr);                                                                               \
     return kc_res_ == ncclSuccess ? void(0)                                                                      \
                                   : KC_FATAL("NCCL check failed: `" #expr "`: {}", ncclGetErrorString(kc_res_)); \
   }())
+
+#define KC_CUDA_CHECK(expr)                                                                                      \
+  ([&]() {                                                                                                       \
+    cudaError_t kc_res_ = (expr);                                                                                \
+    return kc_res_ == cudaSuccess ? void(0)                                                                      \
+                                  : KC_FATAL("CUDA check failed: `" #expr "`: {}", cudaGetErrorString(kc_res_)); \
+  }())
 #endif
@@ -21,7 +21,7 @@ TYPED_TEST_SUITE(AllGather, ScalarTypes);
 
 template <typename Scalar>
 auto allgather_0d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
@@ -48,7 +48,7 @@ auto allgather_0d() -> void {
 
 template <typename Scalar>
 auto allgather_contig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
 
@@ -21,7 +21,7 @@ TYPED_TEST_SUITE(AllReduce, ScalarTypes);
 
 template <typename Scalar>
 auto allreduce_0d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
@@ -41,23 +41,23 @@ auto allreduce_0d() -> void {
 
   int errs;
   Kokkos::parallel_reduce(
-      rv.extent(0), KOKKOS_LAMBDA(const int, int &lsum) { lsum += (rv() != size * (size - 1) / 2); }, errs
+      rv.extent(0), KOKKOS_LAMBDA(const int, int& lsum) { lsum += (rv() != size * (size - 1) / 2); }, errs
   );
   EXPECT_EQ(errs, 0);
 }
 
 template <typename Scalar>
 auto allreduce_contig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
   const int rank  = nccl_ctx.rank();
   const int root  = 0;
 
   const int n_contrib = 10;
-  Kokkos::View<Scalar *> sv("sv", n_contrib);
-  Kokkos::View<Scalar *> rv("rv", n_contrib);
+  Kokkos::View<Scalar*> sv("sv", n_contrib);
+  Kokkos::View<Scalar*> rv("rv", n_contrib);
 
   // Prepare send buffer
   Kokkos::parallel_for(
@@ -69,7 +69,7 @@ auto allreduce_contig_1d() -> void {
 
   int errs;
   Kokkos::parallel_reduce(
-      rv.extent(0), KOKKOS_LAMBDA(const int i, int &lsum) { lsum += (rv(i) != size * (size - 1) / 2 + size * i); }, errs
+      rv.extent(0), KOKKOS_LAMBDA(const int i, int& lsum) { lsum += (rv(i) != size * (size - 1) / 2 + size * i); }, errs
   );
   EXPECT_EQ(errs, 0);
 }
 
@@ -21,16 +21,16 @@ TYPED_TEST_SUITE(AllToAll, ScalarTypes);
 
 template <typename Scalar>
 auto alltoall_contig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
   const int rank  = nccl_ctx.rank();
   const int root  = 0;
 
   const int n_contrib = 100;
-  Kokkos::View<Scalar *> sv("sv", size * n_contrib);
-  Kokkos::View<Scalar *> rv("rv", size * n_contrib);
+  Kokkos::View<Scalar*> sv("sv", size * n_contrib);
+  Kokkos::View<Scalar*> rv("rv", size * n_contrib);
 
   // Prepare send view
   Kokkos::parallel_for(
@@ -43,7 +43,7 @@ auto alltoall_contig_1d() -> void {
   int errs;
   Kokkos::parallel_reduce(
       rv.extent(0),
-      KOKKOS_LAMBDA(const int i, int &lsum) {
+      KOKKOS_LAMBDA(const int i, int& lsum) {
         const int src = i / n_contrib;                       // who sent this data
         const int j   = rank * n_contrib + (i % n_contrib);  // what index i was at the source
         lsum += rv(i) != src + j;
 
@@ -20,7 +20,7 @@ TYPED_TEST_SUITE(Broadcast, ScalarTypes);
 
 template <typename Scalar>
 auto broadcast_0d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
@@ -47,7 +47,7 @@ auto broadcast_0d() -> void {
 
 template <typename Scalar>
 auto broadcast_contig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
 
@@ -27,7 +27,7 @@ TYPED_TEST_SUITE(PointToPoint, ScalarTypes);
 
 template <typename Scalar>
 auto p2p_contig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
@@ -60,7 +60,7 @@ auto p2p_contig_1d() -> void {
 
 template <typename Scalar>
 auto p2p_noncontig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
 
@@ -23,7 +23,7 @@ TYPED_TEST_SUITE(Reduce, ScalarTypes);
 /// operation is sum, so recvbuf[i] should be sum(0..size) + i * size
 template <typename Scalar>
 auto reduce_contig_1d() -> void {
-  auto nccl_ctx   = test_utils::nccl::Ctx::init();
+  auto& nccl_ctx  = test_utils::NcclCtx::get();
   const auto exec = Kokkos::Cuda(nccl_ctx.stream());
   const auto comm = nccl_ctx.comm();
   const int size  = nccl_ctx.size();
 
@@ -0,0 +1,107 @@
+#include <memory>
+#include <mutex>
+
+#include <mpi.h>
+#include <nccl.h>
+#include <cuda_runtime.h>
+
+#include "utils.hpp"
+#include "../logging.hpp"
+
+namespace {
+
+/// Returns this process's rank within the group of processes that MPI places in the
+/// same `MPI_COMM_TYPE_SHARED` communicator as the caller.
+///
+/// `comm` is split with `MPI_Comm_split_type`, using `my_rank` as the ordering key, and
+/// the caller's rank inside the resulting communicator is returned. The temporary
+/// communicator is freed before returning. Every MPI call goes through `KC_MPI_CHECK`,
+/// so a failing call is reported instead of silently yielding an indeterminate rank.
+[[nodiscard]] auto get_local_rank(MPI_Comm comm, int my_rank) -> int {
+  MPI_Comm node_comm = MPI_COMM_NULL;
+  KC_MPI_CHECK(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, my_rank, MPI_INFO_NULL, &node_comm));
+
+  int node_rank = 0;
+  KC_MPI_CHECK(MPI_Comm_rank(node_comm, &node_rank));
+
+  KC_MPI_CHECK(MPI_Comm_free(&node_comm));
+  return node_rank;
+}
+
+}  // namespace
+
+namespace test_utils {
+
+// Storage for the shared context object; empty until `init()` assigns it.
+std::unique_ptr<NcclCtx> NcclCtx::instance_ = nullptr;
+// Flag passed to `std::call_once` inside `init()`.
+std::once_flag NcclCtx::init_flag_;
+
+NcclCtx::NcclCtx(ncclComm_t comm, cudaStream_t stream, int dev, int size, int rank)
+    : comm_(comm), stream_(stream), dev_(dev), size_(size), rank_(rank) {}
+
+/// Releases the stored CUDA stream first, then the NCCL communicator. Null handles are
+/// skipped, and the return codes of the destroy calls are not inspected.
+NcclCtx::~NcclCtx() {
+  if (stream_) {
+    (void)cudaStreamDestroy(stream_);
+  }
+  if (comm_) {
+    (void)ncclCommDestroy(comm_);
+  }
+}
+
+/// Builds the shared NCCL context exactly once per process (guarded by `init_flag_`).
+///
+/// Steps, each verified with the matching `KC_*_CHECK` macro:
+///   1. require that MPI is already initialized;
+///   2. query size/rank on `MPI_COMM_WORLD` and the local rank from `get_local_rank`;
+///   3. select the CUDA device whose index equals the local rank;
+///   4. have rank 0 create the NCCL unique id and broadcast it to every rank;
+///   5. create the NCCL communicator and a CUDA stream, then store them in `instance_`.
+///
+/// \param verbose when true, logs the detected device count and the device assignment.
+///
+/// NOTE(review): `fini()` resets `instance_` but not `init_flag_`, so a call to `init()`
+/// made after `fini()` does not rebuild the context -- confirm this is intended.
+auto NcclCtx::init(bool verbose) -> void {
+  std::call_once(init_flag_, [verbose]() {
+    int flag = 0;
+    KC_MPI_CHECK(MPI_Initialized(&flag));
+    KC_CHECK(flag != 0, "MPI is not initialized");
+
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+
+    int size = 0;
+    KC_MPI_CHECK(MPI_Comm_size(mpi_comm, &size));
+    int rank = 0;
+    KC_MPI_CHECK(MPI_Comm_rank(mpi_comm, &rank));
+
+    int local_rank = get_local_rank(mpi_comm, rank);
+
+    int devs = 0;
+    KC_CUDA_CHECK(cudaGetDeviceCount(&devs));
+
+    if (verbose) {
+      KC_INFO("P{} found {} CUDA devices", rank, devs);
+    }
+
+    // One device per local rank: fail early if this node has fewer devices than ranks.
+    KC_CHECK(local_rank < devs, "P{} needs device #{} but only {} devices available", rank, local_rank, devs);
+
+    KC_CUDA_CHECK(cudaSetDevice(local_rank));
+
+    if (verbose) {
+      KC_INFO("P{} assigned to CUDA device #{}", rank, local_rank);
+    }
+
+    // Only rank 0 generates the id; the other ranks receive it through the broadcast below.
+    ncclUniqueId nccl_id{};
+    if (rank == 0) {
+      KC_NCCL_CHECK(ncclGetUniqueId(&nccl_id));
+    }
+
+    // The id is an opaque blob, so it is sent as raw bytes (MPI_BYTE) rather than as
+    // characters, and sized from the object itself.
+    KC_MPI_CHECK(MPI_Bcast(&nccl_id, static_cast<int>(sizeof(nccl_id)), MPI_BYTE, 0, mpi_comm));
+
+    ncclComm_t nccl_comm = nullptr;
+    KC_NCCL_CHECK(ncclCommInitRank(&nccl_comm, size, nccl_id, rank));
+
+    cudaStream_t stream = nullptr;
+    KC_CUDA_CHECK(cudaStreamCreate(&stream));
+
+    // Plain `new` instead of `std::make_unique` is kept from the original code.
+    instance_ = std::unique_ptr<NcclCtx>(new NcclCtx(nccl_comm, stream, local_rank, size, rank));
+  });
+}
+
+/// Destroys the stored context object, if any, and leaves `instance_` empty.
+auto NcclCtx::fini() -> void {
+  instance_ = nullptr;
+}
+
+/// Returns a reference to the stored context; `KC_CHECK` reports
+/// "NCCL context not initialized" when `instance_` is empty.
+auto NcclCtx::get() -> NcclCtx& {
+  NcclCtx* const ctx = instance_.get();
+  KC_CHECK(ctx != nullptr, "NCCL context not initialized");
+  return *ctx;
+}
+
+/// Returns the stored NCCL communicator handle.
+auto NcclCtx::comm() const -> ncclComm_t {
+  return this->comm_;
+}
+
+/// Returns the stored CUDA stream handle.
+auto NcclCtx::stream() const -> cudaStream_t {
+  return this->stream_;
+}
+
+/// Returns the stored size value.
+auto NcclCtx::size() const -> int {
+  return this->size_;
+}
+
+/// Returns the stored rank value.
+auto NcclCtx::rank() const -> int {
+  return this->rank_;
+}
+
+/// Returns the stored device index.
+auto NcclCtx::device() const -> int {
+  return this->dev_;
+}
+
+}  // namespace test_utils