@@ -317,6 +317,16 @@ bool isNvlsSupported();
317317// / @return True if the pointer is allocated by cuMemMap, false otherwise.
318318bool isCuMemMapAllocated (void * ptr);
319319
320+ // / Granularity used to size a `GpuBuffer` allocation so that it is compatible with the multicast (NVLS) API. Passed to the `GpuBuffer` constructor; ignored when the buffer is not allocated through the multicast-compatible path.
321+ enum class GpuBufferGranularity {
322+ // / Minimum multicast granularity (`CU_MULTICAST_GRANULARITY_MINIMUM`). Rounds the allocation up to the minimum
323+ // / granularity required for multicast compatibility, minimizing memory footprint. This is the constructor default.
324+ MultiCastMinimum,
325+ // / Recommended multicast granularity (`CU_MULTICAST_GRANULARITY_RECOMMENDED`). Rounds the allocation up to the
326+ // / granularity recommended by the driver, which may be larger than the minimum but can yield better performance.
327+ MultiCastRecommended,
328+ };
329+
320330// / Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
321331// / `GpuBuffer::data()`.
322332// /
@@ -334,16 +344,22 @@ class GpuBuffer {
334344 public:
335345 // / Constructs a GpuBuffer with the specified number of elements.
336346 // / @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
337- GpuBuffer (size_t nelems) : nelems_(nelems) {
347+ // / @param granularity Granularity used to size the allocation for multicast (NVLS) compatibility. Defaults to
348+ // / `GpuBufferGranularity::MultiCastMinimum`, which minimizes memory usage. This is ignored when the buffer is not
349+ // / allocated through the multicast-compatible path.
350+ GpuBuffer (size_t nelems, [[maybe_unused]] GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum)
351+ : nelems_(nelems) {
338352 if (nelems == 0 ) {
339353 bytes_ = 0 ;
340354 return ;
341355 }
342356 MSCCLPP_CUDATHROW (cudaGetDevice (&deviceId_));
343357#if (CUDA_NVLS_API_AVAILABLE)
344358 if (isNvlsSupported ()) {
345- // TODO: pass granularity from the caller instead of using the minimum granularity.
346- size_t gran = detail::getMulticastGranularity (nelems * sizeof (T), CU_MULTICAST_GRANULARITY_MINIMUM);
359+ CUmulticastGranularity_flags granFlag = (granularity == GpuBufferGranularity::MultiCastRecommended)
360+ ? CU_MULTICAST_GRANULARITY_RECOMMENDED
361+ : CU_MULTICAST_GRANULARITY_MINIMUM;
362+ size_t gran = detail::getMulticastGranularity (nelems * sizeof (T), granFlag);
347363 bytes_ = (nelems * sizeof (T) + gran - 1 ) / gran * gran / sizeof (T) * sizeof (T);
348364 memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
349365 return ;
0 commit comments