kokkos · cwpearson · Nov 21, 2025 · Oct 23, 2025 · Nov 7, 2025 · Nov 7, 2025
diff --git a/.github/workflows/snl-h100.yaml b/.github/workflows/snl-h100.yaml
@@ -1,65 +1,133 @@
 name: H100
 
-
 permissions:
   contents: none
 
 on:
   workflow_call:
 
 jobs:
-  PR_CUDA1250_OPENMPI504:
-    name: PR_CUDA1250_OPENMPI504
-    runs-on: [cuda125-openmpi504-latest-latest]
+  PR_CUDA1262_OPENMPI505:
+    name: PR_CUDA1262_OPENMPI505
 
+    # this label is correct, but the underlying AT2 runner is actually
+    # CUDA 12.6.2 and OpenMPI 5.0.5 <facepalm>
+    runs-on: [cuda125-openmpi504-latest-latest]
     steps:
-      - name: Checkout Kokkos Comm
+      - name: Check NVIDIA GPU
+        run: nvidia-smi
+
+      - name: Kokkos - Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: kokkos/kokkos
+          ref: 4.7.01
+          path: kokkos
+      - name: Kokkos - Configure
+        run: >
+          cmake
+          -S kokkos
+          -B kokkos/build
+          -DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
+          -DCMAKE_CXX_STANDARD=20
+          -DCMAKE_CXX_EXTENSIONS=OFF
+          -DCMAKE_INSTALL_PREFIX=kokkos/install
+          -DKokkos_ENABLE_CUDA=ON
+          -DKokkos_ARCH_HOPPER90=ON
+          -DKokkos_ENABLE_TESTS=OFF
+          -DKokkos_ENABLE_EXAMPLES=OFF
+          -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
+          -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
+          -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
+      - name: Kokkos - Build
+        run: cmake --build kokkos/build --parallel $(nproc)
+      - name: Kokkos - Install
+        run: cmake --build kokkos/build --target install --parallel $(nproc)
+      - name: KokkosComm - Checkout
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           path: kokkos-comm
+      - name: KokkosComm - Configure MPI backend
+        run: >
+          cmake
+          -S kokkos-comm
+          -B build-mpi
+          -DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
+          -DCMAKE_CXX_STANDARD=20
+          -DCMAKE_CXX_EXTENSIONS=OFF
+          -DCMAKE_CXX_FLAGS="-Werror"
+          -DKokkos_ROOT=kokkos/install
+          -DKokkosComm_ENABLE_MPI=ON
+          -DKokkosComm_ENABLE_TESTS=ON
+          -DKokkosComm_ENABLE_PERFTESTS=ON
+      - name: KokkosComm - Build MPI backend
+        run: cmake --build build-mpi --parallel $(nproc)
+      - name: KokkosComm - Test MPI backend
+        working-directory: build-mpi
+        run: ctest --output-on-failure -V --timeout 1200
 
-      - name: Checkout Kokkos
+  PR_CUDA1262_NCCL2275:
+    name: PR_CUDA1262_NCCL2275
+
+    # this label is correct, but the underlying AT2 runner is actually
+    # CUDA 12.6.2 and OpenMPI 5.0.5 <facepalm>
+    runs-on: [cuda125-openmpi504-latest-latest]
+    steps:
+      - name: Check NVIDIA GPU
+        run: nvidia-smi
+
+      - name: Kokkos - Checkout
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: kokkos/kokkos
-          ref: 4.5.01
+          ref: 4.7.01
           path: kokkos
 
-      - name: nvidia-smi
-        run: nvidia-smi
+      - name: Kokkos - Configure
+        run: >
+          cmake
+          -S kokkos
+          -B kokkos/build
+          -DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
+          -DCMAKE_CXX_STANDARD=20
+          -DCMAKE_CXX_EXTENSIONS=OFF
+          -DCMAKE_INSTALL_PREFIX=kokkos/install
+          -DKokkos_ENABLE_CUDA=ON
+          -DKokkos_ARCH_HOPPER90=ON
+          -DKokkos_ENABLE_TESTS=OFF
+          -DKokkos_ENABLE_EXAMPLES=OFF
+          -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
+          -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF
+          -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
+      - name: Kokkos - Build
+        run: cmake --build kokkos/build --parallel $(nproc)
 
-      - name: Configure Kokkos
-        run: |
-          cmake -S kokkos -B kokkos/build \
-            -DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper) \
-            -DCMAKE_CXX_STANDARD=20 \
-            -DCMAKE_CXX_EXTENSIONS=OFF \
-            -DCMAKE_INSTALL_PREFIX=kokkos/install \
-            -DKokkos_ENABLE_CUDA=ON \
-            -DKokkos_ARCH_HOPPER90=ON \
-            -DKokkos_ENABLE_TESTS=OFF \
-            -DKokkos_ENABLE_EXAMPLES=OFF \
-            -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-            -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
-            -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF
-
-      - name: Build Kokkos
+      - name: Kokkos - Install
         run: cmake --build kokkos/build --target install --parallel $(nproc)
 
-      - name: Configure Kokkos Comm
-        run: |
-          cmake -S kokkos-comm -B build \
-            -DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper) \
-            -DCMAKE_CXX_STANDARD=20 \
-            -DCMAKE_CXX_EXTENSIONS=OFF \
-            -DCMAKE_CXX_FLAGS="-Werror" \
-            -DKokkos_ROOT=kokkos/install \
-            -DKokkosComm_ENABLE_TESTS=ON \
-            -DKokkosComm_ENABLE_PERFTESTS=ON
+      - name: KokkosComm - Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          path: kokkos-comm
+
+      - name: KokkosComm - Configure NCCL backend
+        # FIXME_NCCL: no performance tests on NCCL
+        run: >
+          cmake
+          -S kokkos-comm
+          -B build-nccl
+          -DCMAKE_CXX_COMPILER=$(realpath kokkos/bin/nvcc_wrapper)
+          -DCMAKE_CXX_STANDARD=20
+          -DCMAKE_CXX_EXTENSIONS=OFF
+          -DKokkos_ROOT=kokkos/install
+          -DKokkosComm_ENABLE_MPI=OFF
+          -DKokkosComm_ENABLE_NCCL=ON
+          -DKokkosComm_ENABLE_TESTS=ON
+          -DKokkosComm_ENABLE_PERFTESTS=OFF
 
-      - name: Build Kokkos Comm
-        run: cmake --build build --parallel $(nproc)
+      - name: KokkosComm - Build NCCL backend
+        run: cmake --build build-nccl --parallel $(nproc)
 
-      - name: Test Kokkos Comm
-        working-directory: build
+      - name: KokkosComm - Test NCCL backend
+        working-directory: build-nccl
         run: ctest --output-on-failure -V --timeout 1200
diff --git a/docs/design/nccl_interop.rst b/docs/design/nccl_interop.rst
@@ -0,0 +1,14 @@
+*********************
+NCCL interoperability
+*********************
+
+There are several challenges with supporting NCCL.
+
+* For multi-process NCCL, we need a way to share a unique ID between processes so that the different processes know they're part of the same NCCL communicator. For the time being, this is accomplished via MPI in the nccl tests.
+
+* NCCL has the concept of a non-blocking communicator. This causes all NCCL operations to potentially return ``ncclInProgress`` BEFORE they actually put an GPU operations into streams. This means we can't just synchronize on a CUDA stream in the NCCL backend's ``wait`` implementation. Either:
+
+  * our NCCL operations need to effectively become blocking (checking NCCL's async status thing until it's no longer in progress)
+  * our ``wait`` implementation needs to do that
+
+* It is not guaranteed to be safe for NCCL operations and GPU-aware MPI operations to be simultaneously active on the same set of GPUs. This is a challenge both if we permit multiple backends to exist, and for interop with existing MPI and/or NCCL applications.
diff --git a/docs/index.rst b/docs/index.rst
@@ -29,6 +29,7 @@ Documentation Content
 
    design/overview
    design/mpi_interop
+   design/nccl_interop
 
 .. toctree::
    :maxdepth: 1

diff --git a/perf_tests/CMakeLists.txt b/perf_tests/CMakeLists.txt
@@ -18,20 +18,14 @@ endif()
 
 include(FetchContent)
 
-# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
-if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
-  cmake_policy(SET CMP0135 NEW)
-endif()
-
+FetchContent_Declare(benchmark
+  GIT_REPOSITORY https://github.com/google/benchmark.git
+  GIT_TAG
+    eddb0241389718a23a42db6af5f0164b6e0139af # v1.9.4
+)
 set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
-FetchContent_Declare(benchmark URL https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip)
-# FetchContent_MakeAvailable(benchmark) was making install benchmark as well
-# EXCLUDE_FROM_ALL here seems to be the magic
-if(NOT benchmark_POPULATED)
-  FetchContent_Populate(benchmark)
-  add_subdirectory(${benchmark_SOURCE_DIR} ${benchmark_BINARY_DIR} EXCLUDE_FROM_ALL)
-endif()
-unset(BENCHMARK_ENABLE_TESTING)
+set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(benchmark)
 
 if(KOKKOSCOMM_ENABLE_MPI)
   add_subdirectory(mpi)

diff --git a/src/KokkosComm/nccl/impl/cuda_check.hpp b/src/KokkosComm/nccl/impl/cuda_check.hpp
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
+
+#pragma once
+
+#if defined(KOKKOSCOMM_ENABLE_NCCL)
+#define KC_CUDA_CHECK(expr)                                                                      \
+  ([&]() {                                                                                       \
+    cudaError_t kcErr = (expr);                                                                  \
+    if (cudaSuccess != kcErr) {                                                                  \
+      std::cerr << __FILE__ << ":" << __LINE__ << ": CUDA Error: " << cudaGetErrorString(kcErr); \
+    }                                                                                            \
+  }())
+#endif
diff --git a/src/KokkosComm/nccl/impl/nccl_check.hpp b/src/KokkosComm/nccl/impl/nccl_check.hpp
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project
+
+#pragma once
+
+#if defined(KOKKOSCOMM_ENABLE_NCCL)
+#define KC_NCCL_CHECK(expr)                                                                      \
+  ([&]() {                                                                                       \
+    ncclResult_t kcRes = (expr);                                                                 \
+    if (ncclSuccess != kcRes) {                                                                  \
+      std::cerr << __FILE__ << ":" << __LINE__ << ": NCCL Error: " << ncclGetErrorString(kcRes); \
+    }                                                                                            \
+  }())
+#endif
diff --git a/src/KokkosComm/nccl/impl/packer.hpp b/src/KokkosComm/nccl/impl/packer.hpp
@@ -32,7 +32,7 @@ struct DeepCopy {
   static auto pack(const ExecSpace &space, const View &src) -> PackedNcclView<PackedView> {
     PackedView packed_src = KokkosComm::Impl::allocate_contiguous_for(space, "DeepCopy::pack", src);
     // Use `ncclUint8` because there is no equivalent to `MPI_PACKED`.
-    PackedNcclView<PackedView> packed(packed_src, ncclUint8, span(src) * sizeof(typename PackedView::value_type));
+    PackedNcclView<PackedView> packed(packed_src, ncclUint8, src.size() * sizeof(typename PackedView::value_type));
     Kokkos::deep_copy(space, packed.view_, src);
     return packed;
   }

diff --git a/src/KokkosComm/nccl/recv.hpp b/src/KokkosComm/nccl/recv.hpp
@@ -25,11 +25,12 @@ auto recv(const ExecSpace &space, RecvView &rv, int peer, ncclComm_t comm) -> Re
 
   Req<Nccl> req{space.cuda_stream()};
   if (KC::is_contiguous(rv)) {
-    ncclRecv(KC::data_handle(rv), KC::span(rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream());
+    KC_NCCL_CHECK(ncclRecv(KC::data_handle(rv), KC::span(rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
   } else {
     using Packer = typename Impl::PackTraits<RecvView>::packer_type;
     auto pckd_rv = KC::Impl::allocate_contiguous_for(space, "KC::nccl::recv pckd_rv", rv);
-    ncclRecv(KC::data_handle(pckd_rv), KC::span(pckd_rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream());
+    KC_NCCL_CHECK(
+        ncclRecv(KC::data_handle(pckd_rv), KC::span(pckd_rv), Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
     Packer::unpack_into(space, rv, pckd_rv);
     req.extend_view_lifetime(pckd_rv);
   }

diff --git a/src/KokkosComm/nccl/req.hpp b/src/KokkosComm/nccl/req.hpp
@@ -14,6 +14,8 @@
 #include <KokkosComm/fwd.hpp>
 #include "nccl_space.hpp"
 
+#include "impl/cuda_check.hpp"
+
 namespace KokkosComm {
 
 template <>
@@ -62,7 +64,7 @@ class Req<Experimental::Nccl> {
 };
 
 inline auto wait(Req<Experimental::Nccl> &req) -> void {
-  cudaStreamSynchronize(req.get_inner());
+  KC_CUDA_CHECK(cudaStreamSynchronize(req.get_inner()));
   for (auto &f : req.record_->postWaits_) {
     f();
   }
@@ -81,12 +83,16 @@ inline auto wait_any(std::span<Req<Experimental::Nccl>> reqs) -> int {
   // Loop while we don't have at least one completed request.
   while (true) {
     for (size_t r = 0; r < reqs.size(); ++r) {
-      auto res = cudaStreamQuery(reqs[r].get_inner());
+      cudaError_t res = cudaStreamQuery(reqs[r].get_inner());
       // If the current request has completed, we must make sure the post-wait callbacks run and are cleared.
       // Calling `wait` should be a no-op if the request has no callback to execute.
       if (res == cudaSuccess) {
         wait(reqs[r]);
         return static_cast<int>(r);
+      } else if (res == cudaErrorNotReady) {
+        continue;
+      } else {
+        throw std::runtime_error(cudaGetErrorString(res));
       }
     }
   }

diff --git a/src/KokkosComm/nccl/send.hpp b/src/KokkosComm/nccl/send.hpp
@@ -11,6 +11,7 @@
 
 #include "impl/types.hpp"
 #include "impl/pack_traits.hpp"
+#include "impl/nccl_check.hpp"
 
 namespace KokkosComm {
 namespace Experimental::nccl {
@@ -24,11 +25,12 @@ auto send(const ExecSpace& space, const SendView& sv, int peer, ncclComm_t comm)
 
   Req<Nccl> req{space.cuda_stream()};
   if (KC::is_contiguous(sv)) {
-    ncclSend(KC::data_handle(sv), KC::span(sv), Impl::datatype_v<T>, peer, comm, space.cuda_stream());
+    KC_NCCL_CHECK(ncclSend(KC::data_handle(sv), KC::span(sv), Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
   } else {
     using Packer = typename Impl::PackTraits<SendView>::packer_type;
     auto args    = Packer::pack(space, sv);
-    ncclSend(KC::data_handle(args.view_), args.count_, Impl::datatype_v<T>, peer, comm, space.cuda_stream());
+    KC_NCCL_CHECK(
+        ncclSend(KC::data_handle(args.view_), args.count_, Impl::datatype_v<T>, peer, comm, space.cuda_stream()));
     req.extend_view_lifetime(args.view_);
   }
   req.extend_view_lifetime(sv);