Skip to content

Commit 4016f22

Browse files
Binyang2014 and Copilot committed
Expose NVLS multicast granularity option for GpuBuffer
Add a public `GpuBufferGranularity` enum (MultiCastMinimum, MultiCastRecommended) and let GpuBuffer choose the NVLS multicast allocation granularity via a constructor argument, defaulting to MultiCastMinimum to minimize memory usage. Note that this changes the effective default: the constructor previously always used the recommended granularity. Expose the same option through the C++ and Python (nanobind) APIs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent c9f8be6 commit 4016f22

5 files changed

Lines changed: 39 additions & 7 deletions

File tree

docs/cpp_api.rst

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,8 @@ Utilities
128128
.. doxygenclass:: mscclpp::GpuBuffer
129129
:members:
130130

131+
.. doxygenenum:: mscclpp::GpuBufferGranularity
132+
131133
.. doxygenclass:: mscclpp::GpuStream
132134
:members:
133135

include/mscclpp/gpu_utils.hpp

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -317,6 +317,16 @@ bool isNvlsSupported();
317317
/// @return True if the pointer is allocated by cuMemMap, false otherwise.
318318
bool isCuMemMapAllocated(void* ptr);
319319

320+
/// Granularity used to size a `GpuBuffer` allocation so that it is compatible with the multicast (NVLS) API.
321+
enum class GpuBufferGranularity {
322+
/// Minimum multicast granularity. Rounds the allocation up to the minimum granularity required for multicast
323+
/// compatibility, minimizing memory footprint. This is the default.
324+
MultiCastMinimum,
325+
/// Recommended multicast granularity. Rounds the allocation up to the granularity recommended by the driver,
326+
/// which may be larger than the minimum but can yield better performance.
327+
MultiCastRecommended,
328+
};
329+
320330
/// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
321331
/// `GpuBuffer::data()`.
322332
///
@@ -334,15 +344,22 @@ class GpuBuffer {
334344
public:
335345
/// Constructs a GpuBuffer with the specified number of elements.
336346
/// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
337-
GpuBuffer(size_t nelems) : nelems_(nelems) {
347+
/// @param granularity Granularity used to size the allocation for multicast (NVLS) compatibility. Defaults to
348+
/// `GpuBufferGranularity::MultiCastMinimum`, which minimizes memory usage. This is ignored when the buffer is not
349+
/// allocated through the multicast-compatible path.
350+
GpuBuffer(size_t nelems, [[maybe_unused]] GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum)
351+
: nelems_(nelems) {
338352
if (nelems == 0) {
339353
bytes_ = 0;
340354
return;
341355
}
342356
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
343357
#if (CUDA_NVLS_API_AVAILABLE)
344358
if (isNvlsSupported()) {
345-
size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
359+
CUmulticastGranularity_flags granFlag = (granularity == GpuBufferGranularity::MultiCastRecommended)
360+
? CU_MULTICAST_GRANULARITY_RECOMMENDED
361+
: CU_MULTICAST_GRANULARITY_MINIMUM;
362+
size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), granFlag);
346363
bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
347364
memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
348365
return;

python/csrc/gpu_utils_py.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,8 +114,13 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
114114
void register_gpu_utils(nb::module_& m) {
115115
m.def("is_nvls_supported", &isNvlsSupported);
116116

117+
nb::enum_<GpuBufferGranularity>(m, "CppGpuBufferGranularity")
118+
.value("MultiCastMinimum", GpuBufferGranularity::MultiCastMinimum)
119+
.value("MultiCastRecommended", GpuBufferGranularity::MultiCastRecommended);
120+
117121
nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
118-
.def(nb::init<size_t>(), nb::arg("nelems"))
122+
.def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("nelems"),
123+
nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum)
119124
.def("nelems", &GpuBuffer<char>::nelems)
120125
.def("bytes", &GpuBuffer<char>::bytes)
121126
.def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); })

python/mscclpp/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@
100100
"AlgorithmCollection",
101101
"CommGroup",
102102
"GpuBuffer",
103+
"GpuBufferGranularity",
103104
]
104105

105106

python/mscclpp/_core/buffer.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,21 @@
66

77
import cupy as cp
88
import numpy as np
9-
from mscclpp._mscclpp import CppRawGpuBuffer
9+
from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity
1010

11-
__all__ = ["GpuBuffer"]
11+
__all__ = ["GpuBuffer", "GpuBufferGranularity"]
12+
13+
GpuBufferGranularity = CppGpuBufferGranularity
1214

1315

1416
class GpuBuffer(cp.ndarray):
1517
def __new__(
16-
cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
18+
cls,
19+
shape: Union[int, Tuple[int]],
20+
dtype: cp.dtype = float,
21+
strides: Tuple[int] = None,
22+
order: str = "C",
23+
granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum,
1724
):
1825
# Check if `shape` is valid
1926
if isinstance(shape, int):
@@ -25,6 +32,6 @@ def __new__(
2532
if any(s <= 0 for s in shape):
2633
raise ValueError("Shape must be positive.")
2734
# Create the buffer
28-
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
35+
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity)
2936
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
3037
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)

0 commit comments

Comments
 (0)