microsoft · Binyang2014 · Jun 4, 2026 · Jun 4, 2026
diff --git a/docs/cpp_api.rst b/docs/cpp_api.rst
@@ -128,6 +128,8 @@ Utilities
 .. doxygenclass:: mscclpp::GpuBuffer
    :members:
 
+.. doxygenenum:: mscclpp::GpuBufferGranularity
+
 .. doxygenclass:: mscclpp::GpuStream
    :members:
 

diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
@@ -317,6 +317,16 @@ bool isNvlsSupported();
 /// @return True if the pointer is allocated by cuMemMap, false otherwise.
 bool isCuMemMapAllocated(void* ptr);
 
+/// Selects the multicast (NVLS) granularity that a `GpuBuffer` allocation is rounded up to on the NVLS path.
+enum class GpuBufferGranularity {
+  /// Minimum multicast granularity (`CU_MULTICAST_GRANULARITY_MINIMUM`). Rounds the allocation up to the smallest
+  /// multiple that keeps it multicast-compatible, minimizing memory footprint. This is the constructor default.
+  MultiCastMinimum,
+  /// Recommended multicast granularity (`CU_MULTICAST_GRANULARITY_RECOMMENDED`). Rounds the allocation up to the
+  /// driver-recommended multiple, which may be larger than the minimum but can yield better performance.
+  MultiCastRecommended,
+};
+
 /// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
 /// `GpuBuffer::data()`.
 ///
@@ -334,15 +344,22 @@ class GpuBuffer {
  public:
   /// Constructs a GpuBuffer with the specified number of elements.
   /// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
-  GpuBuffer(size_t nelems) : nelems_(nelems) {
+  /// @param granularity Granularity used to size the allocation for multicast (NVLS) compatibility. Defaults to
+  /// `GpuBufferGranularity::MultiCastMinimum`, which minimizes memory usage. Ignored when `nelems` is zero or when
+  /// NVLS is unavailable (built without the CUDA NVLS API, or `isNvlsSupported()` returns false).
+  GpuBuffer(size_t nelems, [[maybe_unused]] GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum)
+      : nelems_(nelems) {
     if (nelems == 0) {
       bytes_ = 0;
       return;
     }
     MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
 #if (CUDA_NVLS_API_AVAILABLE)
     if (isNvlsSupported()) {
-      size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
+      CUmulticastGranularity_flags granFlag = (granularity == GpuBufferGranularity::MultiCastRecommended)
+                                                  ? CU_MULTICAST_GRANULARITY_RECOMMENDED
+                                                  : CU_MULTICAST_GRANULARITY_MINIMUM;
+      size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), granFlag);
       bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
       memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
       return;

diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp
@@ -114,8 +114,13 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
 void register_gpu_utils(nb::module_& m) {
   m.def("is_nvls_supported", &isNvlsSupported);
 
+  nb::enum_<GpuBufferGranularity>(m, "CppGpuBufferGranularity")
+      .value("MultiCastMinimum", GpuBufferGranularity::MultiCastMinimum)
+      .value("MultiCastRecommended", GpuBufferGranularity::MultiCastRecommended);
+
   nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
-      .def(nb::init<size_t>(), nb::arg("nelems"))
+      .def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("nelems"),
+           nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum)
       .def("nelems", &GpuBuffer<char>::nelems)
       .def("bytes", &GpuBuffer<char>::bytes)
       .def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); })

diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
@@ -100,6 +100,7 @@
     "AlgorithmCollection",
     "CommGroup",
     "GpuBuffer",
+    "GpuBufferGranularity",
 ]
 
 

diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py
@@ -6,14 +6,21 @@
 
 import cupy as cp
 import numpy as np
-from mscclpp._mscclpp import CppRawGpuBuffer
+from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity
 
-__all__ = ["GpuBuffer"]
+__all__ = ["GpuBuffer", "GpuBufferGranularity"]
+
+GpuBufferGranularity = CppGpuBufferGranularity
 
 
 class GpuBuffer(cp.ndarray):
     def __new__(
-        cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
+        cls,
+        shape: Union[int, Tuple[int]],
+        dtype: cp.dtype = float,
+        strides: Tuple[int] = None,
+        order: str = "C",
+        granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum,
     ):
         # Check if `shape` is valid
         if isinstance(shape, int):
@@ -25,6 +32,6 @@ def __new__(
         if any(s <= 0 for s in shape):
             raise ValueError("Shape must be positive.")
         # Create the buffer
-        buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
+        buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity)
         memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
         return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)