Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/cpp_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ Utilities
.. doxygenclass:: mscclpp::GpuBuffer
:members:

.. doxygenenum:: mscclpp::GpuBufferGranularity

.. doxygenclass:: mscclpp::GpuStream
:members:

Expand Down
21 changes: 19 additions & 2 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,16 @@ bool isNvlsSupported();
/// @return True if the pointer is allocated by cuMemMap, false otherwise.
bool isCuMemMapAllocated(void* ptr);

/// Selects which multicast (NVLS) granularity a `GpuBuffer` allocation is rounded up to.
///
/// The chosen value is translated into the matching CUDA driver granularity flag when the buffer is allocated
/// through the multicast-compatible path.
enum class GpuBufferGranularity {
  /// Use the minimum multicast granularity (`CU_MULTICAST_GRANULARITY_MINIMUM`). The allocation size is rounded up
  /// only as far as multicast compatibility requires, which keeps the memory footprint as small as possible.
  /// This is the default.
  MultiCastMinimum = 0,
  /// Use the driver-recommended multicast granularity (`CU_MULTICAST_GRANULARITY_RECOMMENDED`). It may be larger
  /// than the minimum granularity, but can yield better performance.
  MultiCastRecommended = 1,
};

/// Allocates a GPU memory space specialized for communication. The memory is zeroed out. Get the device pointer by
/// `GpuBuffer::data()`.
///
Expand All @@ -334,15 +344,22 @@ class GpuBuffer {
public:
/// Constructs a GpuBuffer with the specified number of elements.
/// @param nelems Number of elements to allocate. If it is zero, `data()` will return a null pointer.
GpuBuffer(size_t nelems) : nelems_(nelems) {
/// @param granularity Granularity used to size the allocation for multicast (NVLS) compatibility. Defaults to
/// `GpuBufferGranularity::MultiCastMinimum`, which minimizes memory usage. This is ignored when the buffer is not
/// allocated through the multicast-compatible path.
GpuBuffer(size_t nelems, [[maybe_unused]] GpuBufferGranularity granularity = GpuBufferGranularity::MultiCastMinimum)
: nelems_(nelems) {
if (nelems == 0) {
bytes_ = 0;
return;
}
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_));
#if (CUDA_NVLS_API_AVAILABLE)
if (isNvlsSupported()) {
size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), CU_MULTICAST_GRANULARITY_RECOMMENDED);
CUmulticastGranularity_flags granFlag = (granularity == GpuBufferGranularity::MultiCastRecommended)
? CU_MULTICAST_GRANULARITY_RECOMMENDED
: CU_MULTICAST_GRANULARITY_MINIMUM;
size_t gran = detail::getMulticastGranularity(nelems * sizeof(T), granFlag);
bytes_ = (nelems * sizeof(T) + gran - 1) / gran * gran / sizeof(T) * sizeof(T);
memory_ = detail::gpuCallocPhysicalShared<T>(nelems, gran);
return;
Expand Down
7 changes: 6 additions & 1 deletion python/csrc/gpu_utils_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,13 @@ static nb::capsule toDlpack(GpuBuffer<char> buffer, std::string dataType, std::v
void register_gpu_utils(nb::module_& m) {
m.def("is_nvls_supported", &isNvlsSupported);

nb::enum_<GpuBufferGranularity>(m, "CppGpuBufferGranularity")
.value("MultiCastMinimum", GpuBufferGranularity::MultiCastMinimum)
.value("MultiCastRecommended", GpuBufferGranularity::MultiCastRecommended);

nb::class_<GpuBuffer<char>>(m, "CppRawGpuBuffer")
.def(nb::init<size_t>(), nb::arg("nelems"))
.def(nb::init<size_t, GpuBufferGranularity>(), nb::arg("nelems"),
nb::arg("granularity") = GpuBufferGranularity::MultiCastMinimum)
.def("nelems", &GpuBuffer<char>::nelems)
.def("bytes", &GpuBuffer<char>::bytes)
.def("data", [](GpuBuffer<char>& self) { return reinterpret_cast<uintptr_t>(self.data()); })
Expand Down
1 change: 1 addition & 0 deletions python/mscclpp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
"AlgorithmCollection",
"CommGroup",
"GpuBuffer",
"GpuBufferGranularity",
]


Expand Down
15 changes: 11 additions & 4 deletions python/mscclpp/_core/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@

import cupy as cp
import numpy as np
from mscclpp._mscclpp import CppRawGpuBuffer
from mscclpp._mscclpp import CppRawGpuBuffer, CppGpuBufferGranularity

__all__ = ["GpuBuffer"]
__all__ = ["GpuBuffer", "GpuBufferGranularity"]

GpuBufferGranularity = CppGpuBufferGranularity


class GpuBuffer(cp.ndarray):
def __new__(
cls, shape: Union[int, Tuple[int]], dtype: cp.dtype = float, strides: Tuple[int] = None, order: str = "C"
cls,
shape: Union[int, Tuple[int]],
dtype: cp.dtype = float,
strides: Tuple[int] = None,
order: str = "C",
granularity: CppGpuBufferGranularity = CppGpuBufferGranularity.MultiCastMinimum,
):
# Check if `shape` is valid
if isinstance(shape, int):
Expand All @@ -25,6 +32,6 @@ def __new__(
if any(s <= 0 for s in shape):
raise ValueError("Shape must be positive.")
# Create the buffer
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize)
buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize, granularity)
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0)
return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr)
Loading