add check for nvls fp8 support

Binyang2014 · Binyang2014 · commit cd9d504a99a7 · 2026-06-05T18:19:04.000Z
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu
@@ -176,6 +176,7 @@ struct NvlsBlockPipelineAdapter {
 
 void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = 8;
+  fp8NvlsSupported_ = isFp8NvlsSupported();
   int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   // Per-peer channel allocation must hold up to 4 * nRanksPerIpcDomain entries (see kernel).
   int nBaseChannels = std::max(64, 4 * nRanksPerIpcDomain);
@@ -194,6 +195,10 @@ CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(
     ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
     [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
+    WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
+    return CommResult::CommInvalidArgument;
+  }
   AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu b/src/ext/collectives/allreduce/allreduce_nvls_warp_pipeline.cu
@@ -140,6 +140,7 @@ struct NvlsWarpPipelineAdapter {
 
 void AllreduceNvlsWarpPipeline::initialize(std::shared_ptr<Communicator> comm) {
   nSwitchChannels_ = NUM_NVLS_CONNECTION;
+  fp8NvlsSupported_ = isFp8NvlsSupported();
   int nRanksPerIpcDomain = comm->bootstrap()->getNranksPerIpcDomain();
   // Per-peer channel allocation must hold 2 * nBlocks entries; default nBlocks = 4 * nRanksPerIpcDomain.
   int nBaseChannels = std::max(64, 8 * nRanksPerIpcDomain);
@@ -158,6 +159,10 @@ CommResult AllreduceNvlsWarpPipeline::allreduceKernelFunc(
     ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
     [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras, DataType accumDtype) {
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
+  if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
+    WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
+    return CommResult::CommInvalidArgument;
+  }
   AllreduceFunc allreduce = dispatch<NvlsWarpPipelineAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -97,6 +97,7 @@ void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm) {
   cudaDeviceProp deviceProp;
   MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device));
   computeCapabilityMajor_ = deviceProp.major;
+  fp8NvlsSupported_ = isFp8NvlsSupported();
   nSwitchChannels_ = 32;
   this->conns_ = setupConnections(comm);
   // setup semaphores
@@ -119,13 +120,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
     return CommResult::CommInvalidArgument;
   }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
-#if defined(__FP8_TYPES_EXIST__)
-  bool isFp8Dtype = dtype == mscclpp::DataType::FLOAT8_E4M3FN || dtype == mscclpp::DataType::FLOAT8_E5M2;
-  if (isFp8Dtype && computeCapabilityMajor_ < 10) {
-    WARN("FP8 NVLS allreduce requires compute capability 10.x or newer.");
+  if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
+    WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
     return CommResult::CommInvalidArgument;
   }
-#endif
   AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
     WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -197,12 +197,7 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
 
   // FP8-specific tuning for 32KB-256KB range
   {
-    bool isFp8 = dtype == DataType::FLOAT8_E4M3B15;
-#if defined(__FP8_TYPES_EXIST__)
-    isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
-            dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
-#endif
-    if (isFp8) {
+    if (isFp8DataType(dtype)) {
       if (inputSize < (64 << 10)) {
         nThreadsPerBlock = 64;
       } else if (inputSize >= (64 << 10) && inputSize <= (128 << 10)) {
diff --git a/src/ext/collectives/collective_utils.cu b/src/ext/collectives/collective_utils.cu
@@ -6,11 +6,89 @@
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
+#include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
 namespace mscclpp {
 namespace collective {
+
+namespace {
+
+#if !defined(MSCCLPP_DEVICE_HIP)
+// Probe kernel that reports which compiled variant of this device code is running.
+// Stores 1 through `supported` when the device code was compiled with __CUDA_ARCH__ >= 1000 and
+// with __CUDA_ARCH_SPECIFIC__ or __CUDA_ARCH_FAMILY_SPECIFIC__ defined; stores 0 for every other
+// compilation target.
+__global__ void fp8NvlsSupportProbeKernel(int* supported) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
+    (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__))
+  constexpr int kProbeResult = 1;
+#else
+  constexpr int kProbeResult = 0;
+#endif
+  *supported = kProbeResult;
+}
+
+// Runs fp8NvlsSupportProbeKernel once on a pooled stream and returns true iff the kernel wrote a
+// non-zero flag.
+//
+// A CUDA runtime error from any of the three steps (kernel launch, device-to-host copy, stream
+// synchronization) is reported as "not supported": the runtime's last-error state is reset and
+// false is returned. The copy step deliberately does not throw, so all three steps fail the same
+// way when this function is used to initialize the cached result in isFp8NvlsSupported().
+bool detectFp8NvlsSupport() {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  auto supportedDevice = mscclpp::detail::gpuCallocUnique<int>();
+  auto stream = gpuStreamPool()->getStream();
+
+  fp8NvlsSupportProbeKernel<<<1, 1, 0, stream>>>(supportedDevice.get());
+  // cudaGetLastError() returns the launch status and resets it to cudaSuccess.
+  if (cudaGetLastError() != cudaSuccess) {
+    return false;
+  }
+
+  int supportedHost = 0;
+  cudaError_t err = cudaMemcpyAsync(&supportedHost, supportedDevice.get(), sizeof(supportedHost),
+                                    cudaMemcpyDeviceToHost, stream);
+  if (err == cudaSuccess) {
+    // supportedHost is only read after the stream has drained.
+    err = cudaStreamSynchronize(stream);
+  }
+  if (err != cudaSuccess) {
+    (void)cudaGetLastError();  // Reset the last-error state before reporting "not supported".
+    return false;
+  }
+  return supportedHost != 0;
+}
+#endif
+
+}  // namespace
+
+// Returns true when `dtype` is one of the five FP8 DataType values listed below, regardless of
+// build configuration; false for any other value.
+bool isFp8DataType(DataType dtype) {
+  constexpr DataType kFp8Types[] = {DataType::FLOAT8_E4M3FN, DataType::FLOAT8_E4M3FNUZ, DataType::FLOAT8_E5M2,
+                                    DataType::FLOAT8_E5M2FNUZ, DataType::FLOAT8_E4M3B15};
+  for (const DataType candidate : kFp8Types) {
+    if (dtype == candidate) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true when `dtype` is one of the two FP8 formats selected by the build macros.
+// With __FP8_TYPES_EXIST__ defined, the accepted E4M3 value is FLOAT8_E4M3FNUZ if
+// __FP8_E4M3_IS_FNUZ__ is defined and FLOAT8_E4M3FN otherwise; the accepted E5M2 value is
+// FLOAT8_E5M2FNUZ if __FP8_E5M2_IS_FNUZ__ is defined and FLOAT8_E5M2 otherwise.
+// Without __FP8_TYPES_EXIST__ the function always returns false. FLOAT8_E4M3B15 is never accepted.
+bool isNativeFp8DataType(DataType dtype) {
+#if defined(__FP8_TYPES_EXIST__)
+#if defined(__FP8_E4M3_IS_FNUZ__)
+  constexpr DataType kNativeE4m3 = DataType::FLOAT8_E4M3FNUZ;
+#else
+  constexpr DataType kNativeE4m3 = DataType::FLOAT8_E4M3FN;
+#endif
+#if defined(__FP8_E5M2_IS_FNUZ__)
+  constexpr DataType kNativeE5m2 = DataType::FLOAT8_E5M2FNUZ;
+#else
+  constexpr DataType kNativeE5m2 = DataType::FLOAT8_E5M2;
+#endif
+  return dtype == kNativeE4m3 || dtype == kNativeE5m2;
+#else
+  return false;
+#endif
+}
+
+// Reports whether the FP8 NVLS probe succeeded.
+// When MSCCLPP_DEVICE_HIP is defined this is always false. Otherwise detectFp8NvlsSupport() runs
+// on the first call and its result is cached in a function-local static for all later calls.
+// NOTE(review): the cached value is process-wide, so it presumably reflects whichever device was
+// current at the first call — confirm this is acceptable for multi-device processes.
+bool isFp8NvlsSupported() {
+#if !defined(MSCCLPP_DEVICE_HIP)
+  static const bool cachedProbeResult = detectFp8NvlsSupport();
+  return cachedProbeResult;
+#else
+  return false;
+#endif
+}
+
 std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
                                                            mscclpp::RegisteredMemory localMemory) {
   std::vector<mscclpp::RegisteredMemory> remoteMemories;
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_block_pipeline.hpp
@@ -33,6 +33,7 @@ class AllreduceNvlsBlockPipeline : public AlgorithmBuilder {
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
+  bool fp8NvlsSupported_{false};
 };
 }  // namespace collective
 }  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_warp_pipeline.hpp
@@ -33,6 +33,7 @@ class AllreduceNvlsWarpPipeline : public AlgorithmBuilder {
   std::vector<BaseMemoryChannel> baseChannels_;
   std::vector<Connection> conns_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
+  bool fp8NvlsSupported_{false};
 };
 }  // namespace collective
 }  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_zero_copy.hpp
@@ -36,6 +36,7 @@ class AllreduceNvls : public AlgorithmBuilder {
   std::vector<std::shared_ptr<NvlsConnection>> nvlsConnections_;
   std::vector<std::shared_ptr<NvlsConnection>> nvlsOutConnections_;
   int computeCapabilityMajor_{0};
+  bool fp8NvlsSupported_{false};
 };
 
 }  // namespace collective
diff --git a/src/ext/collectives/include/collective_utils.hpp b/src/ext/collectives/include/collective_utils.hpp
@@ -39,6 +39,10 @@ constexpr int MAX_IPC_DOMAIN_NRANKS = 72;
 
 constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70;  // Two 70 MiB buffers for double-buffered packet scratch space.
 
+// Returns true when dtype is any FP8 DataType value: FLOAT8_E4M3FN, FLOAT8_E4M3FNUZ, FLOAT8_E5M2,
+// FLOAT8_E5M2FNUZ, or FLOAT8_E4M3B15.
+bool isFp8DataType(DataType dtype);
+// Returns true only for the FP8 DataType values selected at build time through __FP8_TYPES_EXIST__,
+// __FP8_E4M3_IS_FNUZ__ and __FP8_E5M2_IS_FNUZ__; returns false for every other dtype.
+bool isNativeFp8DataType(DataType dtype);
+// Returns the cached result of a one-time device probe kernel; always false when
+// MSCCLPP_DEVICE_HIP is defined.
+bool isFp8NvlsSupported();
+
 std::vector<RegisteredMemory> setupRemoteMemories(std::shared_ptr<Communicator> comm, int rank,
                                                   RegisteredMemory localMemory);
 
diff --git a/src/ext/nccl/CMakeLists.txt b/src/ext/nccl/CMakeLists.txt
@@ -13,6 +13,7 @@ target_include_directories(mscclpp_nccl PRIVATE
     include 
     ${PROJECT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/src/core/include
+    ${PROJECT_SOURCE_DIR}/src/ext/collectives/include
     ${GPU_INCLUDE_DIRS}
 )
 target_link_libraries(mscclpp_nccl PUBLIC mscclpp mscclpp_collectives)
diff --git a/src/ext/nccl/algorithm_selector.cc b/src/ext/nccl/algorithm_selector.cc
@@ -6,6 +6,7 @@
 #include <mscclpp/env.hpp>
 #include <mscclpp/utils.hpp>
 
+#include "collective_utils.hpp"
 #include "debug.h"
 
 namespace mscclpp {
@@ -20,24 +21,15 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da
     return false;
   }
 
-  const bool isFp8 = dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
-                     dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
-
-  if (!isFp8) {
+  if (!collective::isFp8DataType(dtype)) {
     return nvlsSupported;
   }
 
-  // FP8 handling
 #if !defined(__HIP_PLATFORM_AMD__)
-  // NVLS does not support FP8 on devices with compute capability < 10
-  if (config.computeCapability.first < 10) {
+  if (!collective::isNativeFp8DataType(dtype)) {
     return false;
   }
-#if (defined(__CUDA_ARCH_SPECIFIC__) || defined(__CUDA_ARCH_FAMILY_SPECIFIC__))
-  return true;
-#else
-  return false;
-#endif
+  return nvlsSupported && config.fp8NvlsSupported;
 #else
   return nvlsSupported;
 #endif
diff --git a/src/ext/nccl/algorithm_selector.hpp b/src/ext/nccl/algorithm_selector.hpp
@@ -16,6 +16,7 @@ namespace nccl {
 struct AlgorithmSelectorConfig {
   bool symmetricMemory;
   bool nvlsSupported;
+  bool fp8NvlsSupported;
   bool isCuMemMapAllocated;
   bool inCaptureMode;
   std::pair<int, int> computeCapability;
diff --git a/src/ext/nccl/nccl.cc b/src/ext/nccl/nccl.cc
@@ -20,6 +20,7 @@
 #include <mscclpp/algorithm.hpp>
 
 #include "algorithm_selector.hpp"
+#include "collective_utils.hpp"
 #include "datatype_conversion.hpp"
 
 static constexpr auto MSCCLPP_NCCL = mscclpp::LogSubsys::NCCL;
@@ -239,6 +240,8 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
   static const bool isNvlsSupported = mscclpp::isNvlsSupported();
   static const std::pair<int, int> deviceComputeCapability = getDeviceComputeCapability();
   static const bool ncclSymmetricMemory = mscclpp::env()->ncclSymmetricMemory;
+  const bool fp8NvlsSupported =
+      mscclpp::collective::isNativeFp8DataType(request.dtype) ? mscclpp::collective::isFp8NvlsSupported() : false;
 
   const bool isCuMemMapAllocated = mscclpp::isCuMemMapAllocated(const_cast<void*>(request.inputBuffer)) &&
                                    mscclpp::isCuMemMapAllocated(request.outputBuffer);
@@ -249,6 +252,7 @@ static std::shared_ptr<mscclpp::Algorithm> algoSelector(
 
   mscclpp::nccl::AlgorithmSelectorConfig config{.symmetricMemory = ncclSymmetricMemory,
                                                 .nvlsSupported = isNvlsSupported,
+                                                .fp8NvlsSupported = fp8NvlsSupported,
                                                 .isCuMemMapAllocated = isCuMemMapAllocated,
                                                 .inCaptureMode = inCaptureMode,
                                                 .computeCapability = deviceComputeCapability,

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ target_include_directories(mscclpp_nccl PRIVATE`
`13`	`13`	`include`
`14`	`14`	`${PROJECT_SOURCE_DIR}/include`
`15`	`15`	`${PROJECT_SOURCE_DIR}/src/core/include`
	`16`	`+ ${PROJECT_SOURCE_DIR}/src/ext/collectives/include`
`16`	`17`	`${GPU_INCLUDE_DIRS}`
`17`	`18`	`)`
`18`	`19`	`target_link_libraries(mscclpp_nccl PUBLIC mscclpp mscclpp_collectives)`