104 changes: 103 additions & 1 deletion modules/nvidia_plugin/src/memory_manager/cuda_memory_pool.cpp
@@ -6,7 +6,9 @@

#include <fmt/printf.h>

#include "memory_manager/model/details/cuda_memory_utils.hpp"
#include "model/cuda_memory_model.hpp"
#include "openvino/core/except.hpp"

namespace ov {
namespace nvidia_gpu {
@@ -29,7 +31,14 @@ MemoryPool::MemoryPool(const size_t num, std::shared_ptr<MemoryModel> memoryMode
}
}

void MemoryPool::Interrupt() { cond_var_.notify_all(); }
void MemoryPool::Interrupt() {
cond_var_.notify_all();
{
std::lock_guard<std::mutex> lock{dyn_mtx_};
interrupted_ = true;
}
dyn_cond_var_.notify_all();
}

MemoryPool::Proxy MemoryPool::WaitAndGet(CancellationToken& cancellationToken) {
std::unique_lock<std::mutex> lock{mtx_};
@@ -60,5 +69,98 @@ void MemoryPool::PushBack(std::unique_ptr<DeviceMemBlock> memManager) {
cond_var_.notify_one();
}

MemoryPool::DynamicHandle
MemoryPool::AllocateDynamic(size_t bytes, CancellationToken& cancellationToken) {
OPENVINO_ASSERT(bytes > 0, "Dynamic allocation size must be > 0");
const size_t aligned_size = applyAllignment(bytes);

// Fast path: try cudaMalloc directly
try {
auto allocation = CUDA::DefaultStream::stream().malloc(aligned_size);
DynamicChunk chunk{std::move(allocation), aligned_size, 0, aligned_size};
return DynamicHandle{std::move(chunk), shared_from_this()};
} catch (...) {
// cudaMalloc failed — fall through to slow path
}

// Slow path: queue a pending request and wait for a released chunk
std::unique_lock<std::mutex> lock{dyn_mtx_};

OPENVINO_ASSERT(!interrupted_, "MemoryPool was interrupted before dynamic allocation could be queued");

const uint64_t my_id = next_request_id_++;
pending_requests_.push_back(PendingRequest{aligned_size, my_id, false, std::nullopt});

auto cur_req = std::prev(pending_requests_.end());

dyn_cond_var_.wait(lock, [this, &cur_req, &cancellationToken] {
return cur_req->done || interrupted_;
});

if (interrupted_ && !cur_req->done) {
pending_requests_.erase(cur_req);
OPENVINO_THROW("MemoryPool interrupted while waiting for dynamic allocation");
}

DynamicChunk result_chunk = std::move(cur_req->chunk.value());
pending_requests_.erase(cur_req);

return DynamicHandle{std::move(result_chunk), shared_from_this()};
}
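
A hypothetical trace of the slow path (the 16 MiB and 10 MiB figures are illustrative): thread A already holds a 16 MiB DynamicHandle and the device has no free memory left. Thread B calls AllocateDynamic with 10 MiB; the cudaMalloc attempt throws, so B appends a PendingRequest and blocks on dyn_cond_var_. When A's handle is destroyed, ReleaseDynamicChunk() finds B's request at the head of the queue, carves the first 10 MiB out of A's 16 MiB chunk, marks the request done and notifies. B wakes, erases its queue entry and returns a DynamicHandle that shares the ref-counted allocation originally obtained by A.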

void MemoryPool::ReleaseDynamicChunk(DynamicChunk chunk) {
std::lock_guard<std::mutex> lock{dyn_mtx_};

auto head = std::find_if(pending_requests_.begin(), pending_requests_.end(),
[](const PendingRequest& r) { return !r.done; });
if (head == pending_requests_.end()) {
return;
}

const size_t available = chunk.usable_size;
if (head->requested_size <= available) {
head->chunk = DynamicChunk{chunk.allocation, chunk.total_size, chunk.offset, head->requested_size};
head->done = true;

// Try to sub-allocate remaining space for next requests
size_t current_offset = chunk.offset + head->requested_size;
const size_t end_offset = chunk.offset + chunk.usable_size;
for (auto it = std::next(head); it != pending_requests_.end() && current_offset < end_offset; ++it) {
if (it->done) {
continue;
}

const size_t aligned_offset = applyAllignment(current_offset);
if (aligned_offset >= end_offset) {
break;
}

const size_t remaining = end_offset - aligned_offset;
if (it->requested_size > remaining) {
continue;
}

it->chunk = DynamicChunk{chunk.allocation, chunk.total_size, aligned_offset, it->requested_size};
it->done = true;
current_offset = aligned_offset + it->requested_size;
}

dyn_cond_var_.notify_all();
} else {
// Head doesn't fit — free the chunk, then retry cudaMalloc for head
// (freed GPU memory returns to CUDA pool, malloc may now succeed)
{ auto _ = std::move(chunk); }

try {
auto allocation = CUDA::DefaultStream::stream().malloc(head->requested_size);
head->chunk = DynamicChunk{std::move(allocation), head->requested_size, 0, head->requested_size};
head->done = true;
dyn_cond_var_.notify_all();
} catch (...) {
// cudaMalloc still fails — head keeps waiting
}
}
}
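
A worked example of the recycling loop above, assuming applyAllignment() rounds up to a 256-byte boundary: a released chunk has offset 0 and usable_size 8192, and the queue holds pending requests of 2048, 4096 and 4096 bytes (already aligned). The head takes bytes [0, 2048). The loop aligns current_offset to 2048, finds 6144 bytes remaining and satisfies the second request with [2048, 6144). Only 2048 bytes remain for the third request, so it is skipped and keeps waiting for the next released chunk or a later successful cudaMalloc.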

} // namespace nvidia_gpu
} // namespace ov
114 changes: 112 additions & 2 deletions modules/nvidia_plugin/src/memory_manager/cuda_memory_pool.hpp
@@ -6,8 +6,11 @@

#include <cancellation_token.hpp>
#include <condition_variable>
#include <list>
#include <mutex>
#include <optional>

#include "cuda/runtime.hpp"
#include "memory_manager/cuda_memory_manager.hpp"
#include "memory_manager/model/cuda_memory_model.hpp"

@@ -16,14 +19,72 @@ class MemoryPoolTest;
namespace ov {
namespace nvidia_gpu {

/**
* @brief A single dynamic GPU sub-allocation.
*
* Holds a shared reference to the underlying cudaMalloc allocation
* plus an offset/size describing this consumer's portion.
* Copyable (shared_ptr inside DefaultAllocation), but typically moved.
*/
struct DynamicChunk {
CUDA::DefaultAllocation allocation; ///< shared_ptr-based, ref-counted GPU memory
size_t total_size; ///< total bytes of the underlying cudaMalloc
size_t offset; ///< this consumer's start offset within allocation
size_t usable_size; ///< this consumer's usable byte count

/** Returns device pointer adjusted by offset. */
void* get() const noexcept {
return static_cast<uint8_t*>(allocation.get()) + offset;
}
};
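
A minimal sketch of the offset arithmetic, assuming a CUDA device is available (the sizes and the helper function are illustrative, not part of this diff):

void dynamic_chunk_sharing_sketch() {
    // One 8 KiB cudaMalloc shared by two sub-allocations.
    auto alloc = CUDA::DefaultStream::stream().malloc(8192);
    ov::nvidia_gpu::DynamicChunk a{alloc, 8192, 0, 2048};               // bytes [0, 2048)
    ov::nvidia_gpu::DynamicChunk b{std::move(alloc), 8192, 2048, 4096}; // bytes [2048, 6144)
    // b.get() points 2048 bytes past a.get(); the underlying cudaMalloc block is
    // freed only when the last copy of the ref-counted allocation is destroyed.
}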

/**
* @brief MemoryPool provides currently available DeviceMemBlock.
*
* This class is an owner of a bunch of DeviceMemBlock-s and provides on request
* WaitAndGet currently available DeviceMemBlock from pool
* WaitAndGet currently available DeviceMemBlock from pool.
*
* Additionally supports dynamic GPU memory allocation for operations that
* need memory with sizes unknown at compile time (dynamic shapes).
*/
class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
public:
/**
* @brief RAII handle for a dynamic GPU allocation.
*
* On destruction, returns the chunk back to the MemoryPool so it
* can be recycled to pending requests or freed.
* Move-only.
*/
class DynamicHandle {
public:
DynamicHandle() = default;
DynamicHandle(DynamicHandle&&) = default;
DynamicHandle& operator=(DynamicHandle&&) = default;
DynamicHandle(const DynamicHandle&) = delete;
DynamicHandle& operator=(const DynamicHandle&) = delete;

~DynamicHandle() {
if (pool_) pool_->ReleaseDynamicChunk(std::move(chunk_.value()));
}

/** Device pointer to the allocated region. */
void* get() const noexcept { return chunk_->get(); }

/** Usable size in bytes. */
size_t size() const noexcept { return chunk_->usable_size; }

explicit operator bool() const noexcept { return chunk_.has_value(); }

private:
friend class MemoryPool;
DynamicHandle(DynamicChunk chunk, std::shared_ptr<MemoryPool> pool)
: chunk_{std::move(chunk)}, pool_{std::move(pool)} {}

std::optional<DynamicChunk> chunk_;
std::shared_ptr<MemoryPool> pool_;
};
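
A brief sketch of the move-only ownership model (the function and lambda here are illustrative, not part of this diff):

void hand_off_sketch(ov::nvidia_gpu::MemoryPool::DynamicHandle handle) {
    // DynamicHandle is move-only: transfer ownership of the chunk into a task.
    auto task = [h = std::move(handle)] {
        if (h) {
            // h.get() / h.size() describe the device region for kernel launches.
        }
        // When the task object is destroyed, ~DynamicHandle() runs and calls
        // MemoryPool::ReleaseDynamicChunk() to recycle or free the memory.
    };
    task();  // `handle` was moved from above, so its own destructor is a no-op
}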

/**
* @brief Proxy provides currently available DeviceMemBlock.
*
Expand All @@ -48,6 +109,16 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
*/
DeviceMemBlock& Get() { return *memory_block_; }

/**
* Dynamically allocate GPU memory through the pool.
* @param bytes Number of bytes to allocate
* @param cancellationToken Token for cancellation support
* @return RAII handle owning the allocation
*/
DynamicHandle AllocateDynamic(size_t bytes, CancellationToken& cancellationToken) {
return pool_->AllocateDynamic(bytes, cancellationToken);
}
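
A caller-side sketch of the combined static and dynamic paths, assuming CancellationToken lives in the same namespace (the 1 MiB size and the variable names are illustrative, not part of this diff):

void infer_step_sketch(std::shared_ptr<ov::nvidia_gpu::MemoryPool> pool,
                       ov::nvidia_gpu::CancellationToken& token) {
    auto proxy = pool->WaitAndGet(token);                   // existing static DeviceMemBlock path
    auto scratch = proxy.AllocateDynamic(1 << 20, token);   // 1 MiB whose size is known only at run time
    // scratch.get() is a device pointer valid until `scratch` is destroyed;
    // destruction hands the chunk back to the pool for recycling or freeing.
}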

/**
* Initialize Proxy with MemoryPool and DeviceMemBlock.
* MemoryPool is needed for returning back DeviceMemBlock
@@ -73,7 +144,7 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
MemoryPool(size_t num, std::shared_ptr<MemoryModel> memoryModel);

/**
* Interrupt waiting of DeviceMemBlock Proxy object
* Interrupt waiting of DeviceMemBlock Proxy object and dynamic allocations
*/
void Interrupt();
/**
@@ -88,15 +159,54 @@ class MemoryPool : public std::enable_shared_from_this<MemoryPool> {
private:
friend class ::MemoryPoolTest;

/**
* @brief Internal pending request in the dynamic allocation queue.
*/
struct PendingRequest {
size_t requested_size;
uint64_t request_id;
bool done = false;
std::optional<DynamicChunk> chunk;
};

/**
* Move DeviceMemBlock back to pool
* @param memManager DeviceMemBlock
*/
void PushBack(std::unique_ptr<DeviceMemBlock> memManager);

/**
* Dynamically allocate GPU memory.
*
* Fast path: tries cudaMalloc directly.
* Slow path: if cudaMalloc fails, queues a PendingRequest and waits
* for another thread to release a suitable chunk.
*
* @param bytes Number of bytes to allocate (will be aligned)
* @param cancellationToken Token for cancellation support
* @return RAII handle owning the allocation
* @throws ov::Exception on interruption or cancellation
*/
DynamicHandle AllocateDynamic(size_t bytes, CancellationToken& cancellationToken);

/**
* Return a dynamic chunk back to the pool.
* If there are pending requests that fit, recycles the memory.
* Otherwise frees the GPU memory.
*/
void ReleaseDynamicChunk(DynamicChunk chunk);

// --- Static memory (existing) ---
std::mutex mtx_;
std::condition_variable cond_var_;
std::vector<std::unique_ptr<DeviceMemBlock>> memory_blocks_;

// --- Dynamic memory ---
std::mutex dyn_mtx_;
std::condition_variable dyn_cond_var_;
std::list<PendingRequest> pending_requests_;  // std::list keeps waiters' iterators valid across push_back/erase
uint64_t next_request_id_{0};
bool interrupted_{false};
};

} // namespace nvidia_gpu