Skip to content

Commit b14754c

Browse files
Merge branch 'develop' into pr/rank-supported
2 parents be7308f + 901ccc7 commit b14754c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+625
-393
lines changed

.github/workflows/snl-h100.yaml

Lines changed: 107 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,133 @@
11
name: H100
22

3-
43
permissions:
54
contents: none
65

76
on:
87
workflow_call:
98

109
jobs:
11-
PR_CUDA1250_OPENMPI504:
12-
name: PR_CUDA1250_OPENMPI504
13-
runs-on: [cuda125-openmpi504-latest-latest]
10+
PR_CUDA1262_OPENMPI505:
11+
name: PR_CUDA1262_OPENMPI505
1412

13+
# this label is correct, but the underlying AT2 runner is actually
14+
# CUDA 12.6.2 and OpenMPI 5.0.5 <facepalm>
15+
runs-on: [cuda125-openmpi504-latest-latest]
1516
steps:
16-
- name: Checkout Kokkos Comm
17+
- name: Check NVIDIA GPU
18+
run: nvidia-smi
19+
20+
- name: Kokkos - Checkout
21+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
22+
with:
23+
repository: kokkos/kokkos
24+
ref: 4.7.01
25+
path: kokkos
26+
- name: Kokkos - Configure
27+
run: >
28+
cmake
29+
-S kokkos
30+
-B kokkos/build
31+
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
32+
-DCMAKE_CXX_STANDARD=20
33+
-DCMAKE_CXX_EXTENSIONS=OFF
34+
-DCMAKE_INSTALL_PREFIX=kokkos/install
35+
-DKokkos_ENABLE_CUDA=ON
36+
-DKokkos_ARCH_HOPPER90=ON
37+
-DKokkos_ENABLE_TESTS=OFF
38+
-DKokkos_ENABLE_EXAMPLES=OFF
39+
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
40+
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
41+
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
42+
- name: Kokkos - Build
43+
run: cmake --build kokkos/build --parallel $(nproc)
44+
- name: Kokkos - Install
45+
run: cmake --build kokkos/build --target install --parallel $(nproc)
46+
- name: KokkosComm - Checkout
1747
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1848
with:
1949
path: kokkos-comm
50+
- name: KokkosComm - Configure MPI backend
51+
run: >
52+
cmake
53+
-S kokkos-comm
54+
-B build-mpi
55+
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
56+
-DCMAKE_CXX_STANDARD=20
57+
-DCMAKE_CXX_EXTENSIONS=OFF
58+
-DCMAKE_CXX_FLAGS="-Werror"
59+
-DKokkos_ROOT=kokkos/install
60+
-DKokkosComm_ENABLE_MPI=ON
61+
-DKokkosComm_ENABLE_TESTS=ON
62+
-DKokkosComm_ENABLE_PERFTESTS=ON
63+
- name: KokkosComm - Build MPI backend
64+
run: cmake --build build-mpi --parallel $(nproc)
65+
- name: KokkosComm - Test MPI backend
66+
working-directory: build-mpi
67+
run: ctest --output-on-failure -V --timeout 1200
2068

21-
- name: Checkout Kokkos
69+
PR_CUDA1262_NCCL2275:
70+
name: PR_CUDA1262_NCCL2275
71+
72+
# this label is correct, but the underlying AT2 runner is actually
73+
# CUDA 12.6.2 and OpenMPI 5.0.5 <facepalm>
74+
runs-on: [cuda125-openmpi504-latest-latest]
75+
steps:
76+
- name: Check NVIDIA GPU
77+
run: nvidia-smi
78+
79+
- name: Kokkos - Checkout
2280
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
2381
with:
2482
repository: kokkos/kokkos
25-
ref: 4.5.01
83+
ref: 4.7.01
2684
path: kokkos
2785

28-
- name: nvidia-smi
29-
run: nvidia-smi
86+
- name: Kokkos - Configure
87+
run: >
88+
cmake
89+
-S kokkos
90+
-B kokkos/build
91+
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
92+
-DCMAKE_CXX_STANDARD=20
93+
-DCMAKE_CXX_EXTENSIONS=OFF
94+
-DCMAKE_INSTALL_PREFIX=kokkos/install
95+
-DKokkos_ENABLE_CUDA=ON
96+
-DKokkos_ARCH_HOPPER90=ON
97+
-DKokkos_ENABLE_TESTS=OFF
98+
-DKokkos_ENABLE_EXAMPLES=OFF
99+
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
100+
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
101+
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
102+
- name: Kokkos - Build
103+
run: cmake --build kokkos/build --parallel $(nproc)
30104

31-
- name: Configure Kokkos
32-
run: |
33-
cmake -S kokkos -B kokkos/build \
34-
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper) \
35-
-DCMAKE_CXX_STANDARD=20 \
36-
-DCMAKE_CXX_EXTENSIONS=OFF \
37-
-DCMAKE_INSTALL_PREFIX=kokkos/install \
38-
-DKokkos_ENABLE_CUDA=ON \
39-
-DKokkos_ARCH_HOPPER90=ON \
40-
-DKokkos_ENABLE_TESTS=OFF \
41-
-DKokkos_ENABLE_EXAMPLES=OFF \
42-
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
43-
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
44-
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
45-
46-
- name: Build Kokkos
105+
- name: Kokkos - Install
47106
run: cmake --build kokkos/build --target install --parallel $(nproc)
48107

49-
- name: Configure Kokkos Comm
50-
run: |
51-
cmake -S kokkos-comm -B build \
52-
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper) \
53-
-DCMAKE_CXX_STANDARD=20 \
54-
-DCMAKE_CXX_EXTENSIONS=OFF \
55-
-DCMAKE_CXX_FLAGS="-Werror" \
56-
-DKokkos_ROOT=kokkos/install \
57-
-DKokkosComm_ENABLE_TESTS=ON \
58-
-DKokkosComm_ENABLE_PERFTESTS=ON
108+
- name: KokkosComm - Checkout
109+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
110+
with:
111+
path: kokkos-comm
112+
113+
- name: KokkosComm - Configure NCCL backend
114+
# FIXME_NCCL: no performance tests on NCCL
115+
run: >
116+
cmake
117+
-S kokkos-comm
118+
-B build-nccl
119+
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
120+
-DCMAKE_CXX_STANDARD=20
121+
-DCMAKE_CXX_EXTENSIONS=OFF
122+
-DKokkos_ROOT=kokkos/install
123+
-DKokkosComm_ENABLE_MPI=OFF
124+
-DKokkosComm_ENABLE_NCCL=ON
125+
-DKokkosComm_ENABLE_TESTS=ON
126+
-DKokkosComm_ENABLE_PERFTESTS=OFF
59127
60-
- name: Build Kokkos Comm
61-
run: cmake --build build --parallel $(nproc)
128+
- name: KokkosComm - Build NCCL backend
129+
run: cmake --build build-nccl --parallel $(nproc)
62130

63-
- name: Test Kokkos Comm
64-
working-directory: build
131+
- name: KokkosComm - Test NCCL backend
132+
working-directory: build-nccl
65133
run: ctest --output-on-failure -V --timeout 1200

docs/design/nccl_interop.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
*********************
2+
NCCL interoperability
3+
*********************
4+
5+
There are several challenges with supporting NCCL.
6+
7+
* For multi-process NCCL, we need a way to share a unique ID between processes so that the different processes know they're part of the same NCCL communicator. For the time being, this is accomplished via MPI in the NCCL tests.
8+
9+
* NCCL has the concept of a non-blocking communicator. This causes all NCCL operations to potentially return ``ncclInProgress`` BEFORE they actually put any GPU operations into streams. This means we can't just synchronize on a CUDA stream in the NCCL backend's ``wait`` implementation. Either:
10+
11+
* our NCCL operations need to effectively become blocking (polling the communicator's asynchronous status, e.g. via ``ncclCommGetAsyncError``, until it is no longer ``ncclInProgress``)
12+
* our ``wait`` implementation needs to do that
13+
14+
* It is not guaranteed to be safe for NCCL operations and GPU-aware MPI operations to be simultaneously active on the same set of GPUs. This is a challenge both if we permit multiple backends to exist, and for interop with existing MPI and/or NCCL applications.

docs/dev/impl_comm_space.rst

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@ For example, for the MPI communication space, we define the following:
2020
2121
namespace KokkosComm {
2222
23-
struct Mpi {
24-
static auto world_size() noexcept -> int { /* ... */ }
25-
static auto world_rank() noexcept -> int { /* ... */ }
23+
struct MpiSpace {
24+
...
2625
};
2726
2827
template <>
@@ -31,7 +30,6 @@ For example, for the MPI communication space, we define the following:
3130
} // end KokkosComm
3231
3332
34-
Notice that ``Mpi`` has two static methods, but that these methods are not required. The main point is that ``struct Mpi`` exists.
3533
To let core API functions know that your communication space is something KokkosComm can use to dispatch messages, you also need to declare the ``Impl::is_communication_space`` specialization using the ``CommunicationSpace`` concept.
3634

3735

@@ -71,7 +69,7 @@ For example, for the MPI communication space request, we define the following:
7169
namespace KokkosComm {
7270
7371
template <>
74-
class Req<Mpi> { /* ... */ };
72+
class Req<MpiSpace> { /* ... */ };
7573
7674
} // end KokkosComm
7775
@@ -103,7 +101,7 @@ For example, for the MPI communication space, we create a partial specialization
103101
namespace KokkosComm::Impl {
104102
105103
template <KokkosView RecvView, KokkosExecutionSpace ExecSpace>
106-
struct Recv<RecvView, ExecSpace, Mpi> { /* ... */ };
104+
struct Recv<RecvView, ExecSpace, MpiSpace> { /* ... */ };
107105
108106
} // end KokkosComm::Impl
109107

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Documentation Content
2929

3030
design/overview
3131
design/mpi_interop
32+
design/nccl_interop
3233

3334
.. toctree::
3435
:maxdepth: 1

perf_tests/CMakeLists.txt

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,14 @@ endif()
1818

1919
include(FetchContent)
2020

21-
# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
22-
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
23-
cmake_policy(SET CMP0135 NEW)
24-
endif()
25-
21+
FetchContent_Declare(benchmark
22+
GIT_REPOSITORY https://github.com/google/benchmark.git
23+
GIT_TAG
24+
eddb0241389718a23a42db6af5f0164b6e0139af # v1.9.4
25+
)
2626
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
27-
FetchContent_Declare(benchmark URL https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip)
28-
# FetchContent_MakeAvailable(benchmark) was making install benchmark as well
29-
# EXCLUDE_FROM_ALL here seems to be the magic
30-
if(NOT benchmark_POPULATED)
31-
FetchContent_Populate(benchmark)
32-
add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR} EXCLUDE_FROM_ALL)
33-
endif()
34-
unset(BENCHMARK_ENABLE_TESTING)
27+
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
28+
FetchContent_MakeAvailable(benchmark)
3529

3630
if(KOKKOSCOMM_ENABLE_MPI)
3731
add_subdirectory(mpi)

src/KokkosComm/collective.hpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,11 @@
2222

2323
namespace KokkosComm::Experimental {
2424

25-
/// Copy the `sv` view on the `root` rank to all ranks' `rv` view.
26-
///
27-
/// The `sv` view is only used on the `root` rank and ignored for all other ranks.
28-
template <KokkosView SendView, KokkosView RecvView, KokkosExecutionSpace ExecSpace = Kokkos::DefaultExecutionSpace,
29-
CommunicationSpace CommSpace = DefaultCommunicationSpace>
30-
auto broadcast(Handle<ExecSpace, CommSpace>& h, const SendView sv, RecvView rv, int root) -> Req<CommSpace> {
31-
return Impl::Broadcast<SendView, RecvView, ExecSpace, CommSpace>::execute(h, sv, rv, root);
32-
}
33-
34-
/// In-place variant of `broadcast`. Copy the `v` view from the `root` rank to all ranks' `v` view.
25+
/// Copy the `v` view from the `root` rank to all ranks' `v` view.
3526
template <KokkosView View, KokkosExecutionSpace ExecSpace = Kokkos::DefaultExecutionSpace,
3627
CommunicationSpace CommSpace = DefaultCommunicationSpace>
3728
auto broadcast(Handle<ExecSpace, CommSpace>& h, View v, int root) -> Req<CommSpace> {
38-
return Impl::Broadcast<View, View, ExecSpace, CommSpace>::execute(h, v, v, root);
29+
return Impl::Broadcast<View, ExecSpace, CommSpace>::execute(h, v, root);
3930
}
4031

4132
/// Copy the `sv` view from each rank to the `rv` view, receiving data from rank `i` at offset

src/KokkosComm/concepts.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,15 @@ template <typename T>
2727
concept KokkosExecutionSpace = Kokkos::is_execution_space_v<T>;
2828

2929
template <typename T>
30-
concept CommunicationSpace = KokkosComm::Impl::is_communication_space<T>::value;
30+
// A communication space must (a) be registered through the
// `Impl::is_communication_space` trait and (b) expose the nested type aliases
// listed below.
concept CommunicationSpace = requires {
31+
// Nested requirement: the trait must evaluate to `true`. A bare
// `is_communication_space<T>::value;` would be a simple requirement that only
// checks the expression is well-formed, not its value.
requires KokkosComm::Impl::is_communication_space<T>::value;
32+
typename T::communication_space;
33+
typename T::handle_type;
34+
typename T::request_type;
35+
typename T::datatype_type;
36+
typename T::reduction_op_type;
37+
typename T::rank_type;
38+
};
3139

3240
template <typename T>
3341
concept ReductionOperator = KokkosComm::Impl::is_reduction_operator<T>::value;

src/KokkosComm/fwd.hpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,19 @@
99

1010
namespace KokkosComm {
1111

12-
// NCCL backend also implicitly declares MPI
1312
#if defined(KOKKOSCOMM_ENABLE_NCCL)
1413
namespace Experimental {
15-
class Nccl;
16-
} // namespace Experimental
17-
class Mpi;
18-
using DefaultCommunicationSpace = Experimental::Nccl;
19-
using FallbackCommunicationSpace = Mpi;
14+
struct NcclSpace;
15+
}
16+
// NCCL backend also declares the MPI space as fallback
17+
struct MpiSpace;
18+
19+
using DefaultCommunicationSpace = Experimental::NcclSpace;
20+
using FallbackCommunicationSpace = MpiSpace;
2021
#elif defined(KOKKOSCOMM_ENABLE_MPI)
21-
class Mpi;
22-
using DefaultCommunicationSpace = Mpi;
23-
using FallbackCommunicationSpace = Mpi;
22+
struct MpiSpace;
23+
using DefaultCommunicationSpace = MpiSpace;
24+
using FallbackCommunicationSpace = MpiSpace;
2425
#else
2526
#error at least one communication space must be enabled
2627
#endif
@@ -47,7 +48,7 @@ struct Send;
4748
// Collectives are currently experimental functions
4849
namespace Experimental::Impl {
4950

50-
template <KokkosView SendView, KokkosView RecvView, KokkosExecutionSpace ExecSpace = Kokkos::DefaultExecutionSpace,
51+
template <KokkosView View, KokkosExecutionSpace ExecSpace = Kokkos::DefaultExecutionSpace,
5152
CommunicationSpace CommSpace = DefaultCommunicationSpace>
5253
struct Broadcast;
5354

src/KokkosComm/mpi/allgather.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,38 @@
88

99
#include <KokkosComm/concepts.hpp>
1010
#include <KokkosComm/traits.hpp>
11+
#include "mpi_space.hpp"
12+
#include "req.hpp"
1113

1214
#include "impl/types.hpp"
1315
#include "impl/error_handling.hpp"
1416

1517
namespace KokkosComm::mpi {
1618

19+
/// Start a non-blocking all-gather of `sv` into `rv` over `comm` using `MPI_Iallgather`.
///
/// `space` is fenced before the MPI call is issued. Both views must be
/// contiguous, otherwise `fail_if` is triggered, and their non-const value
/// types must be identical (enforced at compile time).
///
/// `span(sv)` is passed as both the send count and the receive count.
/// NOTE(review): the size of `rv` is not checked here; presumably it must hold
/// `span(sv)` elements for every rank in `comm` -- confirm against callers.
///
/// \param space execution space that is fenced before communication starts
/// \param sv    view holding the data sent by this rank
/// \param rv    view receiving the gathered data
/// \param comm  MPI communicator the operation is posted on
/// \return the request wrapping the MPI request; `sv` and `rv` are registered
///         on it through `extend_view_lifetime` (presumably to keep them alive
///         until the request completes -- TODO confirm in `req.hpp`)
template <KokkosExecutionSpace ExecSpace, KokkosView SView, KokkosView RView>
20+
auto iallgather(const ExecSpace &space, const SView sv, RView rv, MPI_Comm comm) -> Req<MpiSpace> {
21+
using ST = typename SView::non_const_value_type;
22+
using RT = typename RView::non_const_value_type;
23+
static_assert(std::is_same_v<ST, RT>, "KokkosComm::mpi::iallgather: View value types must be identical");
24+
Kokkos::Tools::pushRegion("KokkosComm::mpi::iallgather");
25+
26+
fail_if(!is_contiguous(sv) || !is_contiguous(rv),
27+
"KokkosComm::mpi::iallgather: unimplemented for non-contiguous views");
28+
29+
// Sync: Work in space may have been used to produce view data.
30+
space.fence("fence before non-blocking all-gather");
31+
32+
Req<MpiSpace> req;
33+
// All ranks send/recv same count
34+
MPI_Iallgather(data_handle(sv), span(sv), Impl::mpi_type_v<ST>, data_handle(rv), span(sv), Impl::mpi_type_v<RT>, comm,
35+
&req.mpi_request());
36+
req.extend_view_lifetime(sv);
37+
req.extend_view_lifetime(rv);
38+
39+
Kokkos::Tools::popRegion();
40+
return req;
41+
}
42+
1743
template <KokkosView SendView, KokkosView RecvView>
1844
void allgather(const SendView &sv, const RecvView &rv, MPI_Comm comm) {
1945
Kokkos::Tools::pushRegion("KokkosComm::Mpi::allgather");

0 commit comments

Comments
 (0)