66#include " allreduce/allreduce_nvls_zero_copy.hpp"
77#include " allreduce/common.hpp"
88#include " collective_utils.hpp"
9- #include " debug.h "
9+ #include " logger.hpp "
1010
1111namespace mscclpp {
1212namespace collective {
@@ -116,17 +116,17 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
116116 [[maybe_unused]] const std::unordered_map<std::string, uintptr_t >& extras,
117117 mscclpp::DataType accumDtype) {
118118 if (!symmetricMemory_) {
119- WARN (" AllreduceNvls requires symmetric memory for now ." );
119+ WARN (ALGO, " AllreduceNvls requires symmetric memory." );
120120 return CommResult::CommInvalidArgument;
121121 }
122122 auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
123123 if (isNativeFp8DataType (dtype) && !fp8NvlsSupported_) {
124- WARN (" FP8 NVLS allreduce requires device support for FP8 multimem reduction." );
124+ WARN (ALGO, " FP8 NVLS allreduce requires device support for FP8 multimem reduction." );
125125 return CommResult::CommInvalidArgument;
126126 }
127127 AllreduceFunc allreduce = dispatch<NvlsAdapter>(op, dtype, accumDtype);
128128 if (!allreduce) {
129- WARN (" Unsupported operation or data type for allreduce, dtype=%d " , static_cast <int >(dtype));
129+ WARN (ALGO, " Unsupported operation or data type for allreduce, dtype=" , static_cast <int >(dtype));
130130 return CommResult::CommInvalidArgument;
131131 }
132132 size_t sendBytes, recvBytes;
@@ -151,7 +151,7 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
151151 }
152152 }
153153 if (numBlocksAndThreads.first > MAX_NBLOCKS) {
154- WARN (" Number of blocks exceeds maximum supported value of %d " , MAX_NBLOCKS);
154+ WARN (ALGO, " Number of blocks exceeds maximum supported value of " , MAX_NBLOCKS);
155155 return CommResult::CommInvalidArgument;
156156 }
157157 cudaError_t error = allreduce (nullptr , nullptr , nullptr , this ->memoryChannelsDeviceHandle_ .get (), nullptr ,
@@ -160,10 +160,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr<void> ctx_vo
160160 numBlocksAndThreads.first , numBlocksAndThreads.second );
161161 if (error != cudaSuccess) {
162162 if (error == cudaErrorNotSupported) {
163- WARN (" AllreduceNvls does not support the requested data type." );
163+ WARN (ALGO, " AllreduceNvls does not support the requested data type." );
164164 return CommResult::CommInvalidArgument;
165165 }
166- WARN (" AllreduceNvls failed with error: %s " , cudaGetErrorString (error));
166+ WARN (ALGO, " AllreduceNvls failed with error: " , cudaGetErrorString (error));
167167 return CommResult::CommUnhandledCudaError;
168168 }
169169 return CommResult::CommSuccess;
@@ -176,6 +176,9 @@ mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void*
176176 CUdeviceptr sendBasePtr, recvBasePtr;
177177 MSCCLPP_CUTHROW (cuMemGetAddressRange (&sendBasePtr, &sendBytes, (CUdeviceptr)input));
178178 MSCCLPP_CUTHROW (cuMemGetAddressRange (&recvBasePtr, &recvBytes, (CUdeviceptr)output));
179+ INFO (ALGO, " Generated context key with sendBasePtr=" , (void *)sendBasePtr, " , recvBasePtr=" , (void *)recvBasePtr,
180+ " , sendBytes=" , sendBytes, " , recvBytes=" , recvBytes, " , input offset=" , (char *)input - (char *)sendBasePtr,
181+ " , output offset=" , (char *)output - (char *)recvBasePtr);
179182 return mscclpp::AlgorithmCtxKey{(void *)sendBasePtr, (void *)recvBasePtr, sendBytes, recvBytes, 0 };
180183}
181184
0 commit comments