104 changes: 103 additions & 1 deletion modules/nvidia_plugin/src/memory_manager/cuda_memory_pool.cpp
@@ -6,7 +6,9 @@

#include <fmt/printf.h>

#include "memory_manager/model/details/cuda_memory_utils.hpp"
#include "model/cuda_memory_model.hpp"
#include "openvino/core/except.hpp"

namespace ov {
namespace nvidia_gpu {
@@ -29,7 +31,14 @@ MemoryPool::MemoryPool(const size_t num, std::shared_ptr<MemoryModel> memoryMode
}
}

void MemoryPool::Interrupt() { cond_var_.notify_all(); }
void MemoryPool::Interrupt() {
cond_var_.notify_all();
{
std::lock_guard<std::mutex> lock{dyn_mtx_};
interrupted_ = true;
}
dyn_cond_var_.notify_all();
}

MemoryPool::Proxy MemoryPool::WaitAndGet(CancellationToken& cancellationToken) {
std::unique_lock<std::mutex> lock{mtx_};
@@ -60,5 +69,98 @@ void MemoryPool::PushBack(std::unique_ptr<DeviceMemBlock> memManager) {
cond_var_.notify_one();
}

MemoryPool::DynamicHandle
MemoryPool::AllocateDynamic(size_t bytes, CancellationToken& cancellationToken) {
OPENVINO_ASSERT(bytes > 0, "Dynamic allocation size must be > 0");
const size_t aligned_size = applyAllignment(bytes);

// Fast path: try cudaMalloc directly
try {
auto allocation = CUDA::DefaultStream::stream().malloc(aligned_size);
DynamicChunk chunk{std::move(allocation), aligned_size, 0, aligned_size};
return DynamicHandle{std::move(chunk), shared_from_this()};
} catch (...) {
// cudaMalloc failed — fall through to slow path
}

// Slow path: queue a pending request and wait for a released chunk
std::unique_lock<std::mutex> lock{dyn_mtx_};

OPENVINO_ASSERT(!interrupted_, "MemoryPool was interrupted before dynamic allocation could be queued");

const uint64_t my_id = next_request_id_++;
pending_requests_.push_back(PendingRequest{aligned_size, my_id, false, std::nullopt});

auto cur_req = std::prev(pending_requests_.end());

dyn_cond_var_.wait(lock, [this, &cur_req, &cancellationToken] {
return cur_req->done || interrupted_;
});

if (interrupted_ && !cur_req->done) {
pending_requests_.erase(cur_req);
OPENVINO_THROW("MemoryPool interrupted while waiting for dynamic allocation");
}

DynamicChunk result_chunk = std::move(cur_req->chunk.value());
pending_requests_.erase(cur_req);

return DynamicHandle{std::move(result_chunk), shared_from_this()};
}
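
A hypothetical trace of the slow path (the 16 MiB and 10 MiB figures are illustrative): thread A already holds a 16 MiB DynamicHandle and the device has no free memory left. Thread B calls AllocateDynamic with 10 MiB; the cudaMalloc attempt throws, so B appends a PendingRequest and blocks on dyn_cond_var_. When A's handle is destroyed, ReleaseDynamicChunk() finds B's request at the head of the queue, carves the first 10 MiB out of A's 16 MiB chunk, marks the request done and notifies. B wakes, erases its queue entry and returns a DynamicHandle that shares the ref-counted allocation originally obtained by A.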

void MemoryPool::ReleaseDynamicChunk(DynamicChunk chunk) {
std::lock_guard<std::mutex> lock{dyn_mtx_};

auto head = std::find_if(pending_requests_.begin(), pending_requests_.end(),
[](const PendingRequest& r) { return !r.done; });
if (head == pending_requests_.end()) {
return;
}

const size_t available = chunk.usable_size;
if (head->requested_size <= available) {
head->chunk = DynamicChunk{chunk.allocation, chunk.total_size, chunk.offset, head->requested_size};
head->done = true;

// Try to sub-allocate remaining space for next requests
size_t current_offset = chunk.offset + head->requested_size;
const size_t end_offset = chunk.offset + chunk.usable_size;
for (auto it = std::next(head); it != pending_requests_.end() && current_offset < end_offset; ++it) {
if (it->done) {
continue;
}

const size_t aligned_offset = applyAllignment(current_offset);
if (aligned_offset >= end_offset) {
break;
}

const size_t remaining = end_offset - aligned_offset;
if (it->requested_size > remaining) {
continue;
}

it->chunk = DynamicChunk{chunk.allocation, chunk.total_size, aligned_offset, it->requested_size};
it->done = true;
current_offset = aligned_offset + it->requested_size;
}

dyn_cond_var_.notify_all();
} else {
// Head doesn't fit — free the chunk, then retry cudaMalloc for head
// (freed GPU memory returns to CUDA pool, malloc may now succeed)
{ auto _ = std::move(chunk); }

try {
auto allocation = CUDA::DefaultStream::stream().malloc(head->requested_size);
head->chunk = DynamicChunk{std::move(allocation), head->requested_size, 0, head->requested_size};
head->done = true;
dyn_cond_var_.notify_all();
} catch (...) {
// cudaMalloc still fails — head keeps waiting
}
}
}
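
A worked example of the recycling loop above, assuming applyAllignment() rounds up to a 256-byte boundary: a released chunk has offset 0 and usable_size 8192, and the queue holds pending requests of 2048, 4096 and 4096 bytes (already aligned). The head takes bytes [0, 2048). The loop aligns current_offset to 2048, finds 6144 bytes remaining and satisfies the second request with [2048, 6144). Only 2048 bytes remain for the third request, so it is skipped and keeps waiting for the next released chunk or a later successful cudaMalloc.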

} // namespace nvidia_gpu
} // namespace ov
114 changes: 112 additions & 2 deletions modules/nvidia_plugin/src/memory_manager/cuda_memory_pool.hpp
@@ -6,8 +6,11 @@

#include <cancellation_token.hpp>
#include <condition_variable>
#include <list>
#include <mutex>
#include <optional>

#include "cuda/runtime.hpp"
#include "memory_manager/cuda_memory_manager.hpp"
#include "memory_manager/model/cuda_memory_model.hpp"

@@ -16,14 +19,72 @@ class MemoryPoolTest;
namespace ov {
namespace nvidia_gpu {

/**
* @brief A single dynamic GPU sub-allocation.
*
* Holds a shared reference to the underlying cudaMalloc allocation
* plus an offset/size describing this consumer's portion.
* Copyable (shared_ptr inside DefaultAllocation), but typically moved.
*/
struct DynamicChunk {
CUDA::DefaultAllocation allocation; ///< shared_ptr-based, ref-counted GPU memory
size_t total_size; ///< total bytes of the underlying cudaMalloc
size_t offset; ///< this consumer's start offset within allocation
size_t usable_size; ///< this consumer's usable byte count

/** Returns device pointer adjusted by offset. */
void* get() const noexcept {
return static_cast<uint8_t*>(allocation.get()) + offset;
}
};
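
A minimal sketch of the offset arithmetic, assuming a CUDA device is available (the sizes and the helper function are illustrative, not part of this diff):

void dynamic_chunk_sharing_sketch() {
    // One 8 KiB cudaMalloc shared by two sub-allocations.
    auto alloc = CUDA::DefaultStream::stream().malloc(8192);
    ov::nvidia_gpu::DynamicChunk a{alloc, 8192, 0, 2048};               // bytes [0, 2048)
    ov::nvidia_gpu::DynamicChunk b{std::move(alloc), 8192, 2048, 4096}; // bytes [2048, 6144)
    // b.get() points 2048 bytes past a.get(); the underlying cudaMalloc block is
    // freed only when the last copy of the ref-counted allocation is destroyed.
}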

/**
* @brief MemoryPool provides currently available DeviceMemBlock.
*
* This class is an owner of a bunch of DeviceMemBlock-s and provides on request
* WaitAndGet currently available DeviceMemBlock from pool
* WaitAndGet currently available DeviceMemBlock from pool.
*
* Additionally supports dynamic GPU memory allocation for operations that
* need memory with sizes unknown at compile time (dynamic shapes).
*/
class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
public:
/**
* @brief RAII handle for a dynamic GPU allocation.
*
* On destruction, returns the chunk back to the MemoryPool so it
* can be recycled to pending requests or freed.
* Move-only.
*/
class DynamicHandle {
public:
DynamicHandle() = default;
DynamicHandle(DynamicHandle&&) = default;
DynamicHandle& operator=(DynamicHandle&&) = default;
DynamicHandle(const DynamicHandle&) = delete;
DynamicHandle& operator=(const DynamicHandle&) = delete;

~DynamicHandle() {
if (pool_) pool_->ReleaseDynamicChunk(std::move(chunk_.value()));
}

/** Device pointer to the allocated region. */
void* get() const noexcept { return chunk_->get(); }

/** Usable size in bytes. */
size_t size() const noexcept { return chunk_->usable_size; }

explicit operator bool() const noexcept { return chunk_.has_value(); }

private:
friend class MemoryPool;
DynamicHandle(DynamicChunk chunk, std::shared_ptr<MemoryPool> pool)
: chunk_{std::move(chunk)}, pool_{std::move(pool)} {}

std::optional<DynamicChunk> chunk_;
std::shared_ptr<MemoryPool> pool_;
};
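
A brief sketch of the move-only ownership model (the function and lambda here are illustrative, not part of this diff):

void hand_off_sketch(ov::nvidia_gpu::MemoryPool::DynamicHandle handle) {
    // DynamicHandle is move-only: transfer ownership of the chunk into a task.
    auto task = [h = std::move(handle)] {
        if (h) {
            // h.get() / h.size() describe the device region for kernel launches.
        }
        // When the task object is destroyed, ~DynamicHandle() runs and calls
        // MemoryPool::ReleaseDynamicChunk() to recycle or free the memory.
    };
    task();  // `handle` was moved from above, so its own destructor is a no-op
}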

/**
* @brief Proxy provides currently available DeviceMemBlock.
*
Expand All @@ -48,6 +109,16 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
*/
DeviceMemBlock& Get() { return *memory_block_; }

/**
* Dynamically allocate GPU memory through the pool.
* @param bytes Number of bytes to allocate
* @param cancellationToken Token for cancellation support
* @return RAII handle owning the allocation
*/
DynamicHandle AllocateDynamic(size_t bytes, CancellationToken& cancellationToken) {
return pool_->AllocateDynamic(bytes, cancellationToken);
}
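
A caller-side sketch of the combined static and dynamic paths, assuming CancellationToken lives in the same namespace (the 1 MiB size and the variable names are illustrative, not part of this diff):

void infer_step_sketch(std::shared_ptr<ov::nvidia_gpu::MemoryPool> pool,
                       ov::nvidia_gpu::CancellationToken& token) {
    auto proxy = pool->WaitAndGet(token);                   // existing static DeviceMemBlock path
    auto scratch = proxy.AllocateDynamic(1 << 20, token);   // 1 MiB whose size is known only at run time
    // scratch.get() is a device pointer valid until `scratch` is destroyed;
    // destruction hands the chunk back to the pool for recycling or freeing.
}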

/**
* Initialize Proxy with MemoryPool and DeviceMemBlock.
* MemoryPool is needed for returning back DeviceMemBlock
@@ -73,7 +144,7 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
MemoryPool(size_t num, std::shared_ptr<MemoryModel> memoryModel);

/**
* Interrupt waiting of DeviceMemBlock Proxy object
* Interrupt waiting of DeviceMemBlock Proxy object and dynamic allocations
*/
void Interrupt();
/**
@@ -88,15 +159,54 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
private:
friend class ::MemoryPoolTest;

/**
* @brief Internal pending request in the dynamic allocation queue.
*/
struct PendingRequest {
size_t requested_size;
uint64_t request_id;
bool done = false;
std::optional<DynamicChunk> chunk;
};

/**
* Move DeviceMemBlock back to pool
* @param memManager DeviceMemBlock
*/
void PushBack(std::unique_ptr<DeviceMemBlock> memManager);

/**
* Dynamically allocate GPU memory.
*
* Fast path: tries cudaMalloc directly.
* Slow path: if cudaMalloc fails, queues a PendingRequest and waits
* for another thread to release a suitable chunk.
*
* @param bytes Number of bytes to allocate (will be aligned)
* @param cancellationToken Token for cancellation support
* @return RAII handle owning the allocation
* @throws ov::Exception on interruption or cancellation
*/
DynamicHandle AllocateDynamic(size_t bytes, CancellationToken& cancellationToken);

/**
* Return a dynamic chunk back to the pool.
* If there are pending requests that fit, recycles the memory.
* Otherwise frees the GPU memory.
*/
void ReleaseDynamicChunk(DynamicChunk chunk);

// --- Static memory (existing) ---
std::mutex mtx_;
std::condition_variable cond_var_;
std::vector<std::unique_ptr<DeviceMemBlock>> memory_blocks_;

// --- Dynamic memory ---
std::mutex dyn_mtx_;
std::condition_variable dyn_cond_var_;
std::list<PendingRequest> pending_requests_;  // std::list keeps waiters' iterators valid across push_back/erase
uint64_t next_request_id_{0};
bool interrupted_{false};
};

} // namespace nvidia_gpu