Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/KokkosComm/collective.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@

#include "fwd.hpp"
#include "reduction_op.hpp"
#if defined(KOKKOSCOMM_ENABLE_MPI)
#include "mpi/mpi_space.hpp"
#include "mpi/handle.hpp"
#include "mpi/req.hpp"
#include "mpi/broadcast.hpp"
#endif
#if defined(KOKKOSCOMM_ENABLE_NCCL)
#include "nccl/nccl_space.hpp"
#include "nccl/handle.hpp"
Expand Down
16 changes: 14 additions & 2 deletions src/KokkosComm/mpi/broadcast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

#include "impl/error_handling.hpp"

namespace KokkosComm::mpi {
namespace KokkosComm {
namespace mpi {

template <KokkosExecutionSpace ExecSpace, KokkosView View>
auto ibroadcast(const ExecSpace& space, View& v, int root, MPI_Comm comm) -> Req<MpiSpace> {
Expand Down Expand Up @@ -56,4 +57,15 @@ void broadcast(ExecSpace const& space, View const& v, int root, MPI_Comm comm) {
Kokkos::Tools::popRegion();
}

} // namespace KokkosComm::mpi
} // namespace mpi
namespace Experimental::Impl {

/// MPI specialization of the generic broadcast dispatch.
///
/// `execute` pulls the execution space and the communicator out of the handle and forwards
/// them, together with the view and the root rank, to `KokkosComm::mpi::ibroadcast`. The
/// request produced by that call is handed back to the caller unchanged.
template <KokkosView View, KokkosExecutionSpace ExecSpace>
struct Broadcast<View, ExecSpace, MpiSpace> {
  static auto execute(Handle<ExecSpace, MpiSpace>& h, View& v, int root) -> Req<MpiSpace> {
    const auto& exec_space = h.space();
    const auto comm        = h.comm();
    return KokkosComm::mpi::ibroadcast(exec_space, v, root, comm);
  }
};

} // namespace Experimental::Impl
} // namespace KokkosComm
2 changes: 1 addition & 1 deletion src/KokkosComm/nccl/broadcast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ auto broadcast(const Kokkos::Cuda& space, View& v, int root, ncclComm_t comm) ->

Req<NcclSpace> req{space.cuda_stream()};
if (KC::is_contiguous(v)) {
ncclBcast(KC::data_handle(v), KC::span(v), datatype<NccSpace, T>, root, comm, space.cuda_stream());
ncclBcast(KC::data_handle(v), KC::span(v), datatype<NcclSpace, T>(), root, comm, space.cuda_stream());
} else {
Kokkos::abort("KokkosComm::Experimental::nccl::broadcast: unimplemented for non-contiguous views");
}
Expand Down
90 changes: 90 additions & 0 deletions unit_tests/test_broadcast.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <KokkosComm/KokkosComm.hpp>

#if defined(KOKKOSCOMM_ENABLE_NCCL)
#include "nccl/utils.hpp"
#endif

namespace {

/// Typed GoogleTest fixture for the broadcast tests.
/// Exposes the element type under test as `Scalar` so test bodies can forward it.
template <typename T>
struct Broadcast : testing::Test {
  using Scalar = T;
};
// Scalar types every typed test in this file is instantiated with. The list is selected at
// compile time: builds with KOKKOSCOMM_ENABLE_NCCL use a shorter list than all other builds.
// NOTE(review): presumably the complex and unsigned types are left out of the NCCL list
// because the NCCL datatype mapping does not cover them — confirm.
#if defined(KOKKOSCOMM_ENABLE_NCCL)
using ScalarTypes = testing::Types<float, double, int, int64_t>;
#else
using ScalarTypes =
testing::Types<float, double, Kokkos::complex<float>, Kokkos::complex<double>, int, unsigned, int64_t, size_t>;
#endif
// Bind the `Broadcast` fixture to the selected type list.
TYPED_TEST_SUITE(Broadcast, ScalarTypes);

template <typename Scalar>
auto broadcast_0d() -> void {
#if defined(KOKKOSCOMM_ENABLE_NCCL)
using ExecSpace = Kokkos::Cuda;
auto nccl_ctx = test_utils::nccl::Ctx::init();
KokkosComm::Handle<ExecSpace, KokkosComm::Nccl> h(ExecSpace(), nccl_ctx.comm());
#else
using ExecSpace = Kokkos::DefaultExecutionSpace;
KokkosComm::Handle<Kokkos::DefaultExecutionSpace, KokkosComm::Mpi> h{};
#endif
int rank = h.rank();
int size = h.size();
int root = 0;

Kokkos::View<Scalar> v("v");
if (rank == root) {
// Prepare broadcast view
Kokkos::parallel_for(
Kokkos::RangePolicy(ExecSpace(), 0, v.extent(0)), KOKKOS_LAMBDA(const int) { v() = size; });
}
// Using the same execution space for both operations lets us not need an explicit `fence`
auto req = KokkosComm::Experimental::broadcast(h, v, root);
KokkosComm::wait(req);

int errs;
Kokkos::parallel_reduce(
v.extent(0), KOKKOS_LAMBDA(const int, int& lsum) { lsum += v() != size; }, errs);
EXPECT_EQ(errs, 0);
}

template <typename Scalar>
auto broadcast_contig_1d() -> void {
#if defined(KOKKOSCOMM_ENABLE_NCCL)
using ExecSpace = Kokkos::Cuda;
auto nccl_ctx = test_utils::nccl::Ctx::init();
KokkosComm::Handle<ExecSpace, KokkosComm::Nccl> h(ExecSpace(), nccl_ctx.comm());
#else
using ExecSpace = Kokkos::DefaultExecutionSpace;
KokkosComm::Handle<Kokkos::DefaultExecutionSpace, KokkosComm::Mpi> h{};
#endif
int rank = h.rank();
int size = h.size();
int root = 0;

Kokkos::View<Scalar*> v("v", 100);
if (rank == root) {
// Prepare broadcast view
Kokkos::parallel_for(
Kokkos::RangePolicy(ExecSpace(), 0, v.extent(0)), KOKKOS_LAMBDA(const int i) { v(i) = size + i; });
}
// Using the same execution space for both operations lets us not need an explicit `fence`
auto req = KokkosComm::Experimental::broadcast(h, v, root);
KokkosComm::wait(req);

int errs;
Kokkos::parallel_reduce(
v.extent(0), KOKKOS_LAMBDA(const int i, int& lsum) { lsum += (v(i) != size + i); }, errs);
EXPECT_EQ(errs, 0);
}

// Run each broadcast scenario once per scalar type of the typed suite.
TYPED_TEST(Broadcast, 0D) {
  using Scalar = typename TestFixture::Scalar;
  broadcast_0d<Scalar>();
}
TYPED_TEST(Broadcast, Contiguous1D) {
  using Scalar = typename TestFixture::Scalar;
  broadcast_contig_1d<Scalar>();
}

} // namespace
Loading