Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
97 commits
Select commit Hold shift + click to select a range
e711b62
Initial plan
Copilot Feb 11, 2026
c881bc5
Replace gtest/gtest.h with framework.hpp in all unit tests
Copilot Feb 11, 2026
e227fdc
Convert mp_unit tests from gtest to framework.hpp
Copilot Feb 11, 2026
1e32e17
Address code review comments
Copilot Feb 11, 2026
eafa6fb
Add custom test framework and code coverage support
Copilot Feb 11, 2026
3d8a2e7
Add --gtest_filter support to framework
Copilot Feb 11, 2026
a10aff5
Address code review feedback
Copilot Feb 11, 2026
1818709
Fix CodeQL workflow by disabling test builds
Copilot Feb 11, 2026
5657e4a
Initial plan for fixing test build with GPU bypass
Copilot Feb 11, 2026
0eae34c
Fix test framework for building with Docker
Copilot Feb 11, 2026
4823583
Move FailHelper and SkipHelper into mscclpp::test namespace
Copilot Feb 11, 2026
403b2fb
Remove unnecessary CMake build artifacts from PR
Copilot Feb 11, 2026
305d157
Remove PerfTestResult and reuse TestResult directly
Copilot Feb 11, 2026
b1f458e
Convert test framework identifiers from snake_case to camelCase
Copilot Feb 11, 2026
6da12fa
Comprehensive plan for refactoring
Copilot Feb 11, 2026
7e4365f
Add performance test filtering and remove HTML coverage
Copilot Feb 11, 2026
b59196b
Integrate perf tests into unit_tests and add CI coverage step
Copilot Feb 11, 2026
ba0451a
Remove build2 CMake artifacts from repository
Copilot Feb 11, 2026
50f6a24
Remove test/perf/ directory completely
Copilot Feb 11, 2026
e26f8ab
Address PR review comments
Copilot Feb 11, 2026
7003fec
Simplify filter matching to use substring matching
Copilot Feb 11, 2026
30b9891
simplifying
chhwang Feb 19, 2026
b6ce0f2
simplify
chhwang Feb 19, 2026
d2efc2f
coverage update
chhwang Feb 19, 2026
4afbf78
minor
chhwang Feb 19, 2026
e40c72b
license text update
chhwang Feb 19, 2026
bed85b5
codecov upload
chhwang Feb 19, 2026
4d9acea
badge
chhwang Feb 19, 2026
b693d1b
lint issue
chhwang Feb 19, 2026
2b4adcc
fix lint
chhwang Feb 19, 2026
b64536f
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Feb 19, 2026
dcdd3fe
update UT CI
chhwang Feb 20, 2026
caeec75
updates
chhwang Feb 20, 2026
b9609f8
add coverage flags
chhwang Feb 20, 2026
41695ba
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Feb 20, 2026
febdbf9
WIP; need amd fix
chhwang Feb 21, 2026
c4afbe1
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Feb 23, 2026
04ebd9b
fix coverage file path
chhwang Feb 23, 2026
54e46ba
rocm fix wip
chhwang Feb 23, 2026
6c2bc8f
coverage fix
chhwang Feb 23, 2026
d0c709e
Fix Codecov token usage in coverage upload step
chhwang Feb 23, 2026
edda25d
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Feb 23, 2026
2f02d38
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Feb 24, 2026
2adf4a4
use variable group
chhwang Feb 24, 2026
98b023a
rocm fixes
chhwang Feb 24, 2026
22e5efb
gdrcopy install in container
chhwang Feb 24, 2026
2f27d7d
Update coverage report to exclude additional directories in lcov command
chhwang Feb 24, 2026
d88ee8d
Refine coverage report to include only mscclpp source and include dir…
chhwang Feb 24, 2026
11e27e2
Update coverage report commands to handle errors and adjust paths
chhwang Feb 24, 2026
25f31b4
updates
chhwang Feb 24, 2026
75dfdd9
Merge branch 'main' into chhwang/fix-ib-no-atomic
chhwang Feb 24, 2026
ac4d713
updates
chhwang Feb 24, 2026
ac022c3
a few updates
chhwang Feb 25, 2026
72407af
License
chhwang Feb 25, 2026
8effd97
License
chhwang Feb 25, 2026
fd7358d
License, lint
chhwang Feb 25, 2026
67d1706
optimized recv loop
chhwang Feb 26, 2026
060982d
updates
chhwang Feb 26, 2026
6b2f819
Merge branch 'main' into chhwang/fix-ib-no-atomic
chhwang Feb 26, 2026
eb99a26
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Feb 27, 2026
8c3a436
update CI
chhwang Feb 27, 2026
f4b8574
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Mar 3, 2026
3b56b08
data direct
chhwang Mar 4, 2026
448ceb6
updates
chhwang Mar 5, 2026
7ce841b
Updates
chhwang Mar 5, 2026
bbb9c10
Update Docker image
chhwang Mar 6, 2026
60ff32c
updates
chhwang Mar 6, 2026
00583da
separate pipeline for codecov
chhwang Mar 6, 2026
c699b8a
az pipeline refactoring
chhwang Mar 7, 2026
284d913
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Mar 7, 2026
75ac8be
fix
chhwang Mar 7, 2026
e0c7ddb
fix
chhwang Mar 7, 2026
c40a233
fix
chhwang Mar 7, 2026
375bc13
fix
chhwang Mar 7, 2026
bcb392f
updates
chhwang Mar 8, 2026
ea1dd65
fix
chhwang Mar 8, 2026
d6a6fa2
simplified
chhwang Mar 8, 2026
a9cf938
fix
chhwang Mar 9, 2026
6647338
debugging
chhwang Mar 10, 2026
7a87c2c
debugging
chhwang Mar 10, 2026
cf505d7
debugging
chhwang Mar 10, 2026
757c0ec
debugging
chhwang Mar 11, 2026
e2a5be4
debugging
chhwang Mar 11, 2026
2a705f5
fix merge
chhwang Mar 11, 2026
a38bd9d
Merge branch 'main' into copilot/remove-gtest-use-custom-framework
chhwang Mar 11, 2026
e2a9692
fix merge
chhwang Mar 11, 2026
2c4bab8
fix
chhwang Mar 16, 2026
a937ce4
debugging
chhwang Mar 16, 2026
d66d7e4
debugging
chhwang Mar 17, 2026
5a65cc7
debugging
chhwang Mar 17, 2026
2297a3d
updates
chhwang Mar 18, 2026
2756221
update
chhwang Mar 18, 2026
bff76d5
Fix TearDown() handling and replace assert() in perf tests
Copilot Mar 18, 2026
6082648
fix for npkit
chhwang Mar 18, 2026
79a0149
updates
chhwang Mar 18, 2026
0200532
Merge branch 'copilot/remove-gtest-use-custom-framework' into chhwang…
chhwang Mar 18, 2026
80f554e
Merge branch 'main' into chhwang/fix-ib-no-atomic
chhwang Mar 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT License.

cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CXX)
Expand Down Expand Up @@ -170,6 +170,21 @@ if(MSCCLPP_USE_IB)
endif()
find_package(NUMA REQUIRED)
find_package(Threads REQUIRED)

# GDRCopy (direct host access to GPU memory) is requested by default.
option(MSCCLPP_USE_GDRCOPY "Use GDRCopy for direct GPU memory access from host." ON)
if(MSCCLPP_USE_ROCM)
  # ROCm builds always turn GDRCopy off, whatever the option was set to.
  set(MSCCLPP_USE_GDRCOPY OFF)
endif()
if(MSCCLPP_USE_GDRCOPY)
  # GDRCopy is optional: when it cannot be located, continue with support disabled.
  find_package(GDRCopy)
  if(GDRCOPY_FOUND)
    message(STATUS "GDRCopy found: ${GDRCOPY_LIBRARIES}")
  else()
    message(STATUS "GDRCopy not found, disabling GDRCopy support")
    set(MSCCLPP_USE_GDRCOPY OFF)
  endif()
endif()

include(FetchContent)
FetchContent_Declare(json
GIT_REPOSITORY https://github.com/nlohmann/json.git
Expand Down
37 changes: 37 additions & 0 deletions cmake/FindGDRCopy.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# FindGDRCopy
# -----------
# Locates the GDRCopy header (gdrapi.h) and library (gdrapi).
#
# Optional input variables, used as search hints (checked in this order,
# before the fixed system directories listed below):
#   GDRCOPY_INCLUDE_DIR / GDRCOPY_LIB_DIR - Directory holding the headers / libraries
#   GDRCOPY_ROOT_DIR                      - Base directory of a GDRCopy installation
#
# Variables set once the search has run:
#   GDRCOPY_FOUND
#   GDRCOPY_INCLUDE_DIRS
#   GDRCOPY_LIBRARIES

find_path(GDRCOPY_INCLUDE_DIRS
  NAMES gdrapi.h
  HINTS ${GDRCOPY_INCLUDE_DIR} ${GDRCOPY_ROOT_DIR} ${GDRCOPY_ROOT_DIR}/include /usr/local/include /usr/include
)

find_library(GDRCOPY_LIBRARIES
  NAMES gdrapi
  HINTS ${GDRCOPY_LIB_DIR} ${GDRCOPY_ROOT_DIR} ${GDRCOPY_ROOT_DIR}/lib /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu
)

# Both the header directory and the library must be located for the package to count as found.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
mark_as_advanced(GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
19 changes: 18 additions & 1 deletion docker/base-dev-x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,25 @@ RUN OS_ARCH=$(uname -m) && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/

# Install ROCm-specific packages if building for ROCm
# Install GDRCopy userspace library for CUDA targets
# When TARGET begins with "cuda", this step installs the packaging tools, downloads
# the GDRCopy v${GDRCOPY_VERSION} source tarball from GitHub, builds its Debian
# packages with packages/build-deb-packages.sh, installs only the libgdrapi package,
# and finally deletes the extracted sources, apt lists and everything under /tmp.
# For any other TARGET the step is a no-op.
# NOTE(review): the -k and -t flags presumably skip the gdrdrv-dkms and gdrcopy-tests
# packages -- confirm against the GDRCopy packaging script.
ARG TARGET="cuda13.0"
RUN if echo "$TARGET" | grep -q "^cuda"; then \
GDRCOPY_VERSION="2.5.1" && \
apt-get update -y && \
apt-get install -y --no-install-recommends devscripts debhelper fakeroot pkg-config dkms && \
cd /tmp && \
curl -L https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -o gdrcopy.tar.gz && \
tar xzf gdrcopy.tar.gz && \
cd gdrcopy-${GDRCOPY_VERSION}/packages && \
./build-deb-packages.sh -k -t && \
dpkg -i libgdrapi_*.deb && \
cd / && rm -rf /tmp/gdrcopy* && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*; \
fi

# Install ROCm-specific packages if building for ROCm
RUN if echo "$TARGET" | grep -q "^rocm"; then \
apt-get update -y && \
apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \
Expand Down
5 changes: 5 additions & 0 deletions include/mscclpp/env.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ class Env {
/// Default is false.
const bool forceDisableNvls;

/// Env name: `MSCCLPP_FORCE_DISABLE_GDR`. If set to true, it will disable the GDRCopy support in MSCCL++.
/// When false (default), GDRCopy is auto-detected and enabled if the gdrcopy driver is loaded.
/// Default is false.
const bool forceDisableGdr;

private:
Env();

Expand Down
10 changes: 10 additions & 0 deletions include/mscclpp/semaphore.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ namespace mscclpp {
class Host2DeviceSemaphore {
private:
Semaphore semaphore_;
std::shared_ptr<uint64_t> inboundToken_;
detail::UniqueGpuPtr<uint64_t> expectedInboundToken_;
std::unique_ptr<uint64_t> outboundToken_;

Expand All @@ -29,6 +30,15 @@ class Host2DeviceSemaphore {
/// @param connection The connection associated with this semaphore.
Host2DeviceSemaphore(Communicator& communicator, const Connection& connection);

/// Destructor.
~Host2DeviceSemaphore();

/// Move constructor.
Host2DeviceSemaphore(Host2DeviceSemaphore&&) noexcept = default;

/// Move assignment operator.
Host2DeviceSemaphore& operator=(Host2DeviceSemaphore&&) noexcept = default;

/// Returns the connection.
/// @return The connection associated with this semaphore.
Connection& connection();
Expand Down
6 changes: 6 additions & 0 deletions src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ if(MSCCLPP_USE_IB)
target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
endif()

if(MSCCLPP_USE_GDRCOPY)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GDRCOPY_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GDRCOPY_LIBRARIES})
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
endif()

set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})

if(MSCCLPP_USE_CUDA)
Expand Down
131 changes: 93 additions & 38 deletions src/core/connection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,13 +197,12 @@ void IBConnection::recvThreadFunc() {
}
}

// Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy)
uint64_t newValueHost = 0;

while (!stopRecvThread_.load(std::memory_order_relaxed)) {
auto qp = qp_.lock();
if (!qp) break;
auto qp = qp_.lock();
if (!qp) return;

while (!stopRecvThread_.load(std::memory_order_relaxed)) {
int wcNum = qp->pollRecvCq();
if (wcNum < 0) {
WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed");
Expand All @@ -220,22 +219,32 @@ void IBConnection::recvThreadFunc() {
continue;
}

// The imm_data contains newValue (32-bit, extended to 64-bit)
// Note: getRecvWcImmData already converts from network byte order via ntohl
unsigned int immData = qp->getRecvWcImmData(i);
newValueHost = static_cast<uint64_t>(immData);
// Read the token value written by the remote sender.
#if defined(DEBUG_CUFLUSH) && defined(MSCCLPP_USE_CUDA)
// cuFlush path: read from imm_data then flush NIC->GPU write pipeline for visibility.
newValueHost = static_cast<uint64_t>(qp->getRecvWcImmData(i));
MSCCLPP_CUTHROW(cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to keep this code here?

#else
// Read the 64-bit token from the local signal GPU buffer via volatile load.
// localSignalGpuPtr_ points to either a GDRCopy BAR1 mapping (CUDA) or the
// GPU buffer directly (ROCm system-coherent/uncached memory). volatile is not
// strictly needed here (uncacheable memory and intervening function calls prevent
// stale reads), but is kept as a convention for NIC-written memory.
newValueHost = *static_cast<volatile uint64_t*>(localSignalGpuPtr_);
#endif

// Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr)
uint64_t dstGpuAddr = remoteUpdateDstAddr_;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit confused about this variable. If we use host2hostSemaphore, is this address a host address?

if (dstGpuAddr != 0) {
uint64_t* dstPtr = reinterpret_cast<uint64_t*>(dstGpuAddr);

// Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream
MSCCLPP_CUDATHROW(
cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_));

INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData,
")");
if (remoteUpdateDstAddrMap_ && remoteUpdateDstAddrMap_->valid()) {
// Direct host-side write to GPU memory via GDRCopy BAR1 mapping
remoteUpdateDstAddrMap_->copyTo(&newValueHost, sizeof(uint64_t));
} else {
*dstPtr = newValueHost;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this valid for CUDA? Maybe we can throw an error if the dstAddrMap is invalid in a CUDA environment.

}
}

// Post another recv for future messages
Expand All @@ -250,22 +259,63 @@ IBConnection::IBConnection(std::shared_ptr<Context> context, const Endpoint& loc
: BaseConnection(context, localEndpoint),
transport_(localEndpoint.transport()),
remoteTransport_(remoteEndpoint.transport()),
dummyAtomicSource_(std::make_unique<uint64_t>(0)),
atomicSrc_(std::make_unique<uint64_t>(0)),
ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_),
stopRecvThread_(false),
localGpuDeviceId_(localEndpoint.device().id),
signalStream_(nullptr),
remoteUpdateDstAddr_(0) {
remoteUpdateDstAddr_(0),
remoteSignalGpuMrInfo_{0, 0},
localSignalGpuPtr_(nullptr) {
qp_ = getImpl(localEndpoint).ibQp_;
qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_);
qp_.lock()->rts();
dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_);
validateTransport(dummyAtomicSourceMem_, transport_);
dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_);
atomicSrcMem_ = context->registerMemory(atomicSrc_.get(), sizeof(uint64_t), transport_);
validateTransport(atomicSrcMem_, transport_);
atomicSrcTransportInfo_ = getImpl(atomicSrcMem_).getTransportInfo(transport_);

if (ibNoAtomic_) {
// Create a CUDA stream for async memory copies
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking));
#if defined(MSCCLPP_USE_CUDA)
if (!gdrEnabled()) {
std::string reason = "unknown";
switch (gdrStatus()) {
case GdrStatus::NotBuilt:
reason = "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)";
break;
case GdrStatus::Disabled:
reason = "GDRCopy is disabled via MSCCLPP_FORCE_DISABLE_GDR environment variable";
break;
case GdrStatus::DriverMissing:
reason = "GDRCopy kernel driver is not loaded (/dev/gdrdrv not found)";
break;
case GdrStatus::OpenFailed:
reason = "gdr_open() failed; GDRCopy driver may be misconfigured";
break;
default:
break;
}
THROW(CONN, Error, ErrorCode::InvalidUsage, "IB host-no-atomic mode on CUDA requires GDRCopy: ", reason);
}
#endif

// Extract remote endpoint's signal GPU buffer MR info for write-with-imm destination
const auto& remoteImpl = getImpl(remoteEndpoint);
remoteSignalGpuMrInfo_ = remoteImpl.ibSignalGpuMrInfo_;

// Create a GDR mapping of the local signal GPU buffer. recvThreadFunc reads the
// 64-bit token via localSignalGpuPtr_, which points to the BAR1-mapped host address
// (CUDA/GDRCopy) or the GPU buffer directly (ROCm system-coherent memory).
const auto& localImpl = getImpl(localEndpoint);
if (gdrEnabled() && localImpl.ibSignalGpuBuffer_) {
localSignalGpuMap_ =
std::make_unique<GdrMap>(std::static_pointer_cast<void>(localImpl.ibSignalGpuBuffer_), localGpuDeviceId_);
}
if (localSignalGpuMap_ && localSignalGpuMap_->valid()) {
// Use the BAR1-mapped host pointer; uncacheable MMIO ensures ordered volatile reads.
localSignalGpuPtr_ = localSignalGpuMap_->hostPtr();
} else if (localImpl.ibSignalGpuBuffer_) {
// ROCm: GPU memory is system-coherent, so direct volatile read is safe.
localSignalGpuPtr_ = reinterpret_cast<uint64_t*>(localImpl.ibSignalGpuBuffer_.get());
}

// Pre-post receive requests for incoming write-with-imm
auto qp = qp_.lock();
Expand All @@ -288,22 +338,25 @@ IBConnection::~IBConnection() {
if (recvThread_.joinable()) {
recvThread_.join();
}
if (signalStream_ != nullptr) {
// Synchronize stream to ensure all async copies are complete before destruction
// Ignore errors during teardown (CUDA context may already be destroyed)
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_));
MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_));
}
}
}

// Returns the transport taken from the local endpoint when this connection was constructed.
Transport IBConnection::transport() const { return transport_; }

// Returns the transport taken from the remote endpoint when this connection was constructed.
Transport IBConnection::remoteTransport() const { return remoteTransport_; }

void IBConnection::setRemoteUpdateDstAddr(uint64_t addr) {
remoteUpdateDstAddr_ = addr;
INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr);
// Reports whether this connection is in IB no-atomic mode (the value of ibNoAtomic_).
// NOTE(review): presumably this is the mode whose incoming signals are serviced by
// recvThreadFunc -- confirm against the thread start-up code.
bool IBConnection::usesRecvThread() const { return ibNoAtomic_; }

// Stores the destination that recvThreadFunc forwards received token values to.
//
// @param gpuMem Shared handle to the 64-bit destination word; may be null, in which case
//               the stored address becomes 0 and any existing GDRCopy mapping is dropped.
//
// The raw address is always recorded. When GDRCopy is enabled, a GdrMap over the same
// memory is (re)built as well, taking over the shared handle passed in.
// NOTE(review): the address is published before the mapping exists; if recvThreadFunc can
// already be running at this point it may observe one without the other -- confirm ordering.
void IBConnection::setRemoteUpdateDstAddr(std::shared_ptr<uint64_t> gpuMem) {
  uint64_t* const rawDst = gpuMem.get();
  remoteUpdateDstAddr_ = reinterpret_cast<uint64_t>(rawDst);

  if (gdrEnabled()) {
    if (!gpuMem) {
      // Nothing to map: release whatever mapping was held for the previous destination.
      remoteUpdateDstAddrMap_.reset();
    } else {
      remoteUpdateDstAddrMap_ = std::make_unique<GdrMap>(std::move(gpuMem), localGpuDeviceId_);
    }
  }

  INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)remoteUpdateDstAddr_);
}

void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset,
Expand Down Expand Up @@ -356,22 +409,24 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6
*src = newValue;

if (ibNoAtomic_) {
// Use RDMA write-with-imm instead of atomic operation
// Send only newValue in imm_data (0-byte write)
// The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write
// Use RDMA write-with-imm instead of atomic operation.
// Write the token value (8 bytes) from the local host buffer to the remote signal GPU buffer,
// with newValue also in imm_data (32-bit). The remote's recvThreadFunc reads the token from
// the signal GPU buffer and forwards it to the semaphore's inbound token address.

// Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit)
unsigned int immData = static_cast<unsigned int>(newValue);

// Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything)
qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo,
/*size=*/0, /*wrId=*/0,
// Write the real token value into the host buffer, then RDMA write host->remote GPU
*atomicSrc_ = newValue;
qp_.lock()->stageSendWriteWithImm(atomicSrcTransportInfo_.ibMr, remoteSignalGpuMrInfo_,
/*size=*/sizeof(uint64_t), /*wrId=*/0,
/*srcOffset=*/0, /*dstOffset=*/0,
/*signaled=*/true, /*immData=*/immData);
qp_.lock()->postSend();
INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue);
} else {
qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
qp_.lock()->stageSendAtomicAdd(atomicSrcTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue,
/*signaled=*/true);
qp_.lock()->postSend();
INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue,
Expand Down
2 changes: 0 additions & 2 deletions src/core/context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ void CudaIpcStream::sync() {
}
}

Context::Impl::Impl() {}

IbCtx* Context::Impl::getIbContext(Transport ibTransport) {
// Find IB context or create it
auto it = ibContexts_.find(ibTransport);
Expand Down
25 changes: 24 additions & 1 deletion src/core/endpoint.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,23 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)

ibQp_ = contextImpl.getIbContext(config_.transport)
->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum,
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend);
config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend, ibNoAtomic_);
ibQpInfo_ = ibQp_->getInfo();

// Allocate a 64-bit signal GPU buffer for write-with-imm data payload (ibNoAtomic_ only).
if (ibNoAtomic_ && config_.device.type == DeviceType::GPU && config_.device.id >= 0) {
CudaDeviceGuard deviceGuard(config_.device.id);
#if defined(MSCCLPP_DEVICE_HIP)
ibSignalGpuBuffer_ = detail::gpuCallocUncachedShared<uint64_t>();
#else
ibSignalGpuBuffer_ = detail::gpuCallocShared<uint64_t>();
#endif
ibSignalGpuMr_ =
contextImpl.getIbContext(config_.transport)->registerMr(ibSignalGpuBuffer_.get(), sizeof(uint64_t));
ibSignalGpuMrInfo_ = ibSignalGpuMr_->getInfo();
} else {
ibSignalGpuMrInfo_ = {0, 0};
}
} else if (config_.transport == Transport::Ethernet) {
// Configuring Ethernet Interfaces
abortFlag_ = 0;
Expand All @@ -74,6 +89,10 @@ Endpoint::Impl::Impl(const std::vector<char>& serialization) {
if (AllIBTransports.has(config_.transport)) {
ibLocal_ = false;
it = detail::deserialize(it, ibQpInfo_);
it = detail::deserialize(it, ibNoAtomic_);
if (ibNoAtomic_) {
it = detail::deserialize(it, ibSignalGpuMrInfo_);
}
} else if (config_.transport == Transport::Ethernet) {
it = detail::deserialize(it, socketAddress_);
}
Expand Down Expand Up @@ -103,6 +122,10 @@ MSCCLPP_API_CPP std::vector<char> Endpoint::serialize() const {
detail::serialize(data, pimpl_->pidHash_);
if (AllIBTransports.has(pimpl_->config_.transport)) {
detail::serialize(data, pimpl_->ibQpInfo_);
detail::serialize(data, pimpl_->ibNoAtomic_);
if (pimpl_->ibNoAtomic_) {
detail::serialize(data, pimpl_->ibSignalGpuMrInfo_);
}
} else if (pimpl_->config_.transport == Transport::Ethernet) {
detail::serialize(data, pimpl_->socketAddress_);
}
Expand Down
Loading
Loading