lint and log: apply formatter fixes and switch collectives logging from debug.h to logger.hpp

Binyang2014 · Binyang2014 · commit d754c3564854 · 2026-06-06T00:42:01.000Z
diff --git a/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_zero_copy.cu
@@ -6,7 +6,7 @@
 #include "allreduce/allreduce_nvls_zero_copy.hpp"
 #include "allreduce/common.hpp"
 #include "collective_utils.hpp"
-#include "debug.h"
+#include "logger.hpp"
 
 namespace mscclpp {
 namespace collective {
@@ -116,17 +116,17 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
                                               [[maybe_unused]] const std::unordered_map<std::string, uintptr_t>& extras,
                                               mscclpp::DataType accumDtype) {
   if (!symmetricMemory_) {
-    WARN("AllreduceNvls requires symmetric memory for now.");
+    WARN(ALGO, "AllreduceNvls requires symmetric memory.");
     return CommResult::CommInvalidArgument;
   }
   auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
   if (isNativeFp8DataType(dtype) && !fp8NvlsSupported_) {
-    WARN("FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
+    WARN(ALGO, "FP8 NVLS allreduce requires device support for FP8 multimem reduction.");
     return CommResult::CommInvalidArgument;
   }
   AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
   if (!allreduce) {
-    WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
+    WARN(ALGO, "Unsupported operation or data type for allreduce, dtype=", static_cast<int>(dtype));
     return CommResult::CommInvalidArgument;
   }
   size_t sendBytes, recvBytes;
@@ -151,7 +151,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
     }
   }
   if (numBlocksAndThreads.first > MAX_NBLOCKS) {
-    WARN("Number of blocks exceeds maximum supported value of %d", MAX_NBLOCKS);
+    WARN(ALGO, "Number of blocks exceeds maximum supported value of ", MAX_NBLOCKS);
     return CommResult::CommInvalidArgument;
   }
   cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr,
@@ -160,10 +160,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
                                 numBlocksAndThreads.first, numBlocksAndThreads.second);
   if (error != cudaSuccess) {
     if (error == cudaErrorNotSupported) {
-      WARN("AllreduceNvls does not support the requested data type.");
+      WARN(ALGO, "AllreduceNvls does not support the requested data type.");
       return CommResult::CommInvalidArgument;
     }
-    WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error));
+    WARN(ALGO, "AllreduceNvls failed with error: ", cudaGetErrorString(error));
     return CommResult::CommUnhandledCudaError;
   }
   return CommResult::CommSuccess;
@@ -176,6 +176,9 @@ mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void*
   CUdeviceptr sendBasePtr, recvBasePtr;
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input));
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output));
+  INFO(ALGO, "Generated context key with sendBasePtr=", (void*)sendBasePtr, ", recvBasePtr=", (void*)recvBasePtr,
+       ", sendBytes=", sendBytes, ", recvBytes=", recvBytes, ", input offset=", (char*)input - (char*)sendBasePtr,
+       ", output offset=", (char*)output - (char*)recvBasePtr);
   return mscclpp::AlgorithmCtxKey{(void*)sendBasePtr, (void*)recvBasePtr, sendBytes, recvBytes, 0};
 }
 
diff --git a/src/ext/collectives/allreduce/allreduce_rsag.cu b/src/ext/collectives/allreduce/allreduce_rsag.cu
@@ -53,8 +53,7 @@ __global__ void __launch_bounds__(1024, 1)
   int4* resultBuff4 = reinterpret_cast<int4*>((char*)resultBuff);
   int4* buff4 = reinterpret_cast<int4*>((char*)buff);
   DeviceHandle<BaseMemoryChannel>* memoryChannelsLocal = memoryChannels + blockId * nPeers;
-  using AccumVec =
-      std::conditional_t<std::is_same_v<T, AccumT>, int4, mscclpp::VectorType<AccumT, nelemsPerInt4>>;
+  using AccumVec = std::conditional_t<std::is_same_v<T, AccumT>, int4, mscclpp::VectorType<AccumT, nelemsPerInt4>>;
 
   uint32_t nInt4PerBlock = nInt4PerRank / gridDim.x;
   uint32_t remainderForBlock = nInt4PerRank % gridDim.x;
diff --git a/src/ext/collectives/collective_utils.cu b/src/ext/collectives/collective_utils.cu
@@ -1,15 +1,15 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
-#include "collective_utils.hpp"
-
 #include <algorithm>
 #include <mscclpp/algorithm.hpp>
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/memory_channel.hpp>
 #include <mscclpp/switch_channel.hpp>
 
+#include "collective_utils.hpp"
+
 namespace mscclpp {
 namespace collective {
 
@@ -37,8 +37,8 @@ bool detectFp8NvlsSupport() {
     return false;
   }
 
-  MSCCLPP_CUDATHROW(cudaMemcpyAsync(&supportedHost, supportedDevice.get(), sizeof(supportedHost),
-                                    cudaMemcpyDeviceToHost, stream));
+  MSCCLPP_CUDATHROW(
+      cudaMemcpyAsync(&supportedHost, supportedDevice.get(), sizeof(supportedHost), cudaMemcpyDeviceToHost, stream));
   err = cudaStreamSynchronize(stream);
   if (err != cudaSuccess) {
     (void)cudaGetLastError();
@@ -51,9 +51,8 @@ bool detectFp8NvlsSupport() {
 }  // namespace
 
 bool isFp8DataType(DataType dtype) {
-  return dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
-         dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ ||
-         dtype == DataType::FLOAT8_E4M3B15;
+  return dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || dtype == DataType::FLOAT8_E5M2 ||
+         dtype == DataType::FLOAT8_E5M2FNUZ || dtype == DataType::FLOAT8_E4M3B15;
 }
 
 bool isNativeFp8DataType(DataType dtype) {