Skip to content

Commit 1739f5a

Browse files
committed
WIP
1 parent 70c1d4d commit 1739f5a

1 file changed

Lines changed: 9 additions & 3 deletions

File tree

src/registered_memory.cc

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -69,17 +69,20 @@ std::shared_ptr<void> getPeerMemoryHandle(cudaIpcMemHandle_t ipcHandle) {
6969
};
7070
#if defined(__HIP_PLATFORM_AMD__)
7171
static std::unordered_map<cudaIpcMemHandle_t, std::weak_ptr<void>> peerMemoryHandleMap;
72-
std::mutex mutex;
72+
static std::mutex mutex;
7373
std::lock_guard<std::mutex> lock(mutex);
7474
auto it = peerMemoryHandleMap.find(ipcHandle);
7575
if (it != peerMemoryHandleMap.end()) {
7676
if (auto ptr = it->second.lock()) {
7777
return ptr;
7878
}
79-
throw mscclpp::Error("Failed to get peer memory handle, may already be closed", mscclpp::ErrorCode::InvalidUsage);
8079
}
8180
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&addr, ipcHandle, cudaIpcMemLazyEnablePeerAccess));
82-
std::shared_ptr<void> ptr = std::shared_ptr<void>(addr, deleter);
81+
std::shared_ptr<void> ptr = std::shared_ptr<void>(addr, [ipcHandle, deleter](void* p) {
82+
deleter(p);
83+
std::lock_guard<std::mutex> lock(mutex);
84+
peerMemoryHandleMap.erase(ipcHandle);
85+
});
8386
peerMemoryHandleMap[ipcHandle] = ptr;
8487
return ptr;
8588
#else
@@ -304,6 +307,9 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
304307
#endif // !(CUDA_NVLS_API_AVAILABLE)
305308
} else if (getHostHash() == this->hostHash) {
306309
this->peerHandle = getPeerMemoryHandle(entry.cudaIpcBaseHandle);
310+
if (!this->peerHandle) {
311+
throw Error("Failed to open CUDA IPC handle, may already be closed", ErrorCode::InvalidUsage);
312+
}
307313
this->data = static_cast<char*>(this->peerHandle.get()) + entry.cudaIpcOffsetFromBase;
308314
}
309315
}

0 commit comments

Comments
 (0)