Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 107 additions & 39 deletions .github/workflows/snl-h100.yaml
Original file line number Diff line number Diff line change
@@ -1,65 +1,133 @@
name: H100


permissions:
contents: none

on:
workflow_call:

jobs:
PR_CUDA1250_OPENMPI504:
name: PR_CUDA1250_OPENMPI504
runs-on: [cuda125-openmpi504-latest-latest]
PR_CUDA1262_OPENMPI505:
name: PR_CUDA1262_OPENMPI505

# this label is correct, but the underlying AT2 runner is actually
# CUDA 12.6.2 and OpenMPI 5.0.5 <facepalm>
runs-on: [cuda125-openmpi504-latest-latest]
steps:
- name: Checkout Kokkos Comm
- name: Check NVIDIA GPU
run: nvidia-smi

- name: Kokkos - Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: kokkos/kokkos
ref: 4.7.01
path: kokkos
- name: Kokkos - Configure
run: >
cmake
-S kokkos
-B kokkos/build
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
-DCMAKE_CXX_STANDARD=20
-DCMAKE_CXX_EXTENSIONS=OFF
-DCMAKE_INSTALL_PREFIX=kokkos/install
-DKokkos_ENABLE_CUDA=ON
-DKokkos_ARCH_HOPPER90=ON
-DKokkos_ENABLE_TESTS=OFF
-DKokkos_ENABLE_EXAMPLES=OFF
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
- name: Kokkos - Build
run: cmake --build kokkos/build --parallel $(nproc)
- name: Kokkos - Install
run: cmake --build kokkos/build --target install --parallel $(nproc)
- name: KokkosComm - Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
path: kokkos-comm
- name: KokkosComm - Configure MPI backend
run: >
cmake
-S kokkos-comm
-B build-mpi
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
-DCMAKE_CXX_STANDARD=20
-DCMAKE_CXX_EXTENSIONS=OFF
-DCMAKE_CXX_FLAGS="-Werror"
-DKokkos_ROOT=kokkos/install
-DKokkosComm_ENABLE_MPI=ON
-DKokkosComm_ENABLE_TESTS=ON
-DKokkosComm_ENABLE_PERFTESTS=ON
- name: KokkosComm - Build MPI backend
run: cmake --build build-mpi --parallel $(nproc)
- name: KokkosComm - Test MPI backend
working-directory: build-mpi
run: ctest --output-on-failure -V --timeout 1200

- name: Checkout Kokkos
PR_CUDA1262_NCCL2275:
name: PR_CUDA1262_NCCL2275

# this label is correct, but the underlying AT2 runner is actually
# CUDA 12.6.2 and OpenMPI 5.0.5 <facepalm>
runs-on: [cuda125-openmpi504-latest-latest]
steps:
- name: Check NVIDIA GPU
run: nvidia-smi

- name: Kokkos - Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: kokkos/kokkos
ref: 4.5.01
ref: 4.7.01
path: kokkos

- name: nvidia-smi
run: nvidia-smi
- name: Kokkos - Configure
run: >
cmake
-S kokkos
-B kokkos/build
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
-DCMAKE_CXX_STANDARD=20
-DCMAKE_CXX_EXTENSIONS=OFF
-DCMAKE_INSTALL_PREFIX=kokkos/install
-DKokkos_ENABLE_CUDA=ON
-DKokkos_ARCH_HOPPER90=ON
-DKokkos_ENABLE_TESTS=OFF
-DKokkos_ENABLE_EXAMPLES=OFF
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
- name: Kokkos - Build
run: cmake --build kokkos/build --parallel $(nproc)

- name: Configure Kokkos
run: |
cmake -S kokkos -B kokkos/build \
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper) \
-DCMAKE_CXX_STANDARD=20 \
-DCMAKE_CXX_EXTENSIONS=OFF \
-DCMAKE_INSTALL_PREFIX=kokkos/install \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ARCH_HOPPER90=ON \
-DKokkos_ENABLE_TESTS=OFF \
-DKokkos_ENABLE_EXAMPLES=OFF \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF

- name: Build Kokkos
- name: Kokkos - Install
run: cmake --build kokkos/build --target install --parallel $(nproc)

- name: Configure Kokkos Comm
run: |
cmake -S kokkos-comm -B build \
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper) \
-DCMAKE_CXX_STANDARD=20 \
-DCMAKE_CXX_EXTENSIONS=OFF \
-DCMAKE_CXX_FLAGS="-Werror" \
-DKokkos_ROOT=kokkos/install \
-DKokkosComm_ENABLE_TESTS=ON \
-DKokkosComm_ENABLE_PERFTESTS=ON
- name: KokkosComm - Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
path: kokkos-comm

- name: KokkosComm - Configure NCCL backend
# FIXME_NCCL: no performance tests on NCCL
run: >
cmake
-S kokkos-comm
-B build-nccl
-DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
-DCMAKE_CXX_STANDARD=20
-DCMAKE_CXX_EXTENSIONS=OFF
-DKokkos_ROOT=kokkos/install
-DKokkosComm_ENABLE_MPI=OFF
-DKokkosComm_ENABLE_NCCL=ON
-DKokkosComm_ENABLE_TESTS=ON
-DKokkosComm_ENABLE_PERFTESTS=OFF

- name: Build Kokkos Comm
run: cmake --build build --parallel $(nproc)
- name: KokkosComm - Build NCCL backend
run: cmake --build build-nccl --parallel $(nproc)

- name: Test Kokkos Comm
working-directory: build
- name: KokkosComm - Test NCCL backend
working-directory: build-nccl
run: ctest --output-on-failure -V --timeout 1200
14 changes: 14 additions & 0 deletions docs/design/nccl_interop.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
*********************
NCCL interoperability
*********************

There are several challenges with supporting NCCL.

* For multi-process NCCL, we need a way to share a unique ID between processes so that the different processes know they're part of the same NCCL communicator. For the time being, this is accomplished via MPI in the nccl tests.

* NCCL has the concept of a non-blocking communicator. This causes all NCCL operations to potentially return ``ncclInProgress`` BEFORE they actually put an GPU operations into streams. This means we can't just synchronize on a CUDA stream in the NCCL backend's ``wait`` implementation. Either:

* our NCCL operations need to effectively become blocking (checking NCCL's async status thing until it's no longer in progress)
* our ``wait`` implementation needs to do that

* It is not guaranteed to be safe for NCCL operations and GPU-aware MPI operations to be simultaneously active on the same set of GPUs. This is a challenge both if we permit multiple backends to exist, and for interop with existing MPI and/or NCCL applications.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Documentation Content

design/overview
design/mpi_interop
design/nccl_interop

.. toctree::
:maxdepth: 1
Expand Down
20 changes: 7 additions & 13 deletions perf_tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,14 @@ endif()

include(FetchContent)

# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()

FetchContent_Declare(benchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG
eddb0241389718a23a42db6af5f0164b6e0139af # v1.9.4
)
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
FetchContent_Declare(benchmark URL https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip)
# FetchContent_MakeAvailable(benchmark) was making install benchmark as well
# EXCLUDE_FROM_ALL here seems to be the magic
if(NOT benchmark_POPULATED)
FetchContent_Populate(benchmark)
add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
unset(BENCHMARK_ENABLE_TESTING)
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(benchmark)

if(KOKKOSCOMM_ENABLE_MPI)
add_subdirectory(mpi)
Expand Down
14 changes: 14 additions & 0 deletions src/KokkosComm/nccl/impl/cuda_check.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#pragma once

#if defined(KOKKOSCOMM_ENABLE_NCCL)
#define KC_CUDA_CHECK(expr) \
([&]() { \
cudaError_t kcErr = (expr); \
if (cudaSuccess != kcErr) { \
std::cerr << __FILE__ << ":" << __LINE__ << ": CUDA Error: " << cudaGetErrorString(kcErr); \
} \
}())
#endif
14 changes: 14 additions & 0 deletions src/KokkosComm/nccl/impl/nccl_check.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project

#pragma once

#if defined(KOKKOSCOMM_ENABLE_NCCL)
#define KC_NCCL_CHECK(expr) \
([&]() { \
ncclResult_t kcRes = (expr); \
if (ncclSuccess != kcRes) { \
std::cerr << __FILE__ << ":" << __LINE__ << ": NCCL Error: " << ncclGetErrorString(kcRes); \
} \
}())
#endif
2 changes: 1 addition & 1 deletion src/KokkosComm/nccl/impl/packer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ struct DeepCopy {
static auto pack(const ExecSpace &space, const View &src) -> PackedNcclView<PackedView> {
PackedView packed_src = KokkosComm::Impl::allocate_contiguous_for(space, "DeepCopy::pack", src);
// Use `ncclUint8` because there is no equivalent to `MPI_PACKED`.
PackedNcclView<PackedView> packed(packed_src, ncclUint8, span(src) * sizeof(typename PackedView::value_type));
PackedNcclView<PackedView> packed(packed_src, ncclUint8, src.size() * sizeof(typename PackedView::value_type));
Kokkos::deep_copy(space, packed.view_, src);
return packed;
}
Expand Down
5 changes: 3 additions & 2 deletions src/KokkosComm/nccl/recv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ auto recv(const ExecSpace &space, RecvView &rv, int peer, ncclComm_t comm) -> Re

Req<Nccl> req{space.cuda_stream()};
if (KC::is_contiguous(rv)) {
ncclRecv(KC::data_handle(rv), KC::span(rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream());
KC_NCCL_CHECK(ncclRecv(KC::data_handle(rv), KC::span(rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
} else {
using Packer = typename Impl::PackTraits<RecvView>::packer_type;
auto pckd_rv = KC::Impl::allocate_contiguous_for(space, "KC::nccl::recv pckd_rv", rv);
ncclRecv(KC::data_handle(pckd_rv), KC::span(pckd_rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream());
KC_NCCL_CHECK(
ncclRecv(KC::data_handle(pckd_rv), KC::span(pckd_rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
Packer::unpack_into(space, rv, pckd_rv);
req.extend_view_lifetime(pckd_rv);
}
Expand Down
10 changes: 8 additions & 2 deletions src/KokkosComm/nccl/req.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <KokkosComm/fwd.hpp>
#include "nccl_space.hpp"

#include "impl/cuda_check.hpp"

namespace KokkosComm {

template <>
Expand Down Expand Up @@ -62,7 +64,7 @@ class Req<Experimental::Nccl> {
};

inline auto wait(Req<Experimental::Nccl> &req) -> void {
cudaStreamSynchronize(req.get_inner());
KC_CUDA_CHECK(cudaStreamSynchronize(req.get_inner()));
for (auto &f : req.record_->postWaits_) {
f();
}
Expand All @@ -81,12 +83,16 @@ inline auto wait_any(std::span<Req<Experimental::Nccl>> reqs) -> int {
// Loop while we don't have at least one completed request.
while (true) {
for (size_t r = 0; r < reqs.size(); ++r) {
auto res = cudaStreamQuery(reqs[r].get_inner());
cudaError_t res = cudaStreamQuery(reqs[r].get_inner());
// If the current request has completed, we must make sure the post-wait callbacks run and are cleared.
// Calling `wait` should be a no-op if the request has no callback to execute.
if (res == cudaSuccess) {
wait(reqs[r]);
return static_cast<int>(r);
} else if (res == cudaErrorNotReady) {
continue;
} else {
throw std::runtime_error(cudaGetErrorString(res));
}
}
}
Expand Down
6 changes: 4 additions & 2 deletions src/KokkosComm/nccl/send.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "impl/types.hpp"
#include "impl/pack_traits.hpp"
#include "impl/nccl_check.hpp"

namespace KokkosComm {
namespace Experimental::nccl {
Expand All @@ -24,11 +25,12 @@ auto send(const ExecSpace& space, const SendView& sv, int peer, ncclComm_t comm)

Req<Nccl> req{space.cuda_stream()};
if (KC::is_contiguous(sv)) {
ncclSend(KC::data_handle(sv), KC::span(sv), Impl::datatype_v<T>, peer, comm, space.cuda_stream());
KC_NCCL_CHECK(ncclSend(KC::data_handle(sv), KC::span(sv), Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
} else {
using Packer = typename Impl::PackTraits<SendView>::packer_type;
auto args = Packer::pack(space, sv);
ncclSend(KC::data_handle(args.view_), args.count_, Impl::datatype_v<T>, peer, comm, space.cuda_stream());
KC_NCCL_CHECK(
ncclSend(KC::data_handle(args.view_), args.count_, Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
req.extend_view_lifetime(args.view_);
}
req.extend_view_lifetime(sv);
Expand Down
Loading
Loading