Implementing custom CUDA memory allocator #262

VitalyVaryvdin · 2025-08-12T11:00:13Z

VitalyVaryvdin
Aug 12, 2025

I'd like to implement pool-based CUDA memory allocation.
Can you give me any guidance?

kgnandanwar · 2026-01-19T23:50:51Z

kgnandanwar
Jan 19, 2026

Implementing Custom CUDA Memory Allocator in CV-CUDA

Problem

You want to implement a pool-based CUDA memory allocator for use with CV-CUDA.

Solution

Create a custom allocator class that manages a pool of CUDA device memory blocks. Integrate it with CV-CUDA by passing your allocator to relevant APIs or using it to allocate memory for external memory imports.

C++ Example

// Skeleton of a pool-based CUDA allocator: declarations only, the bodies are
// provided in the complete example further down.
class CudaPoolAllocator {
    // Pointers managed by the pool.
    std::vector<void*> pool_;
    // ...existing code...

public:
    // Construction / destruction.
    CudaPoolAllocator(size_t blockSize, int poolSize);
    ~CudaPoolAllocator();

    // Hands out one block from the pool.
    void* allocate();

    // Gives a block obtained from allocate() back to the pool.
    void deallocate(void* ptr);
};

// Usage with CV-CUDA external memory (pseudo-code: the `...` below is a
// placeholder, so this fragment does not compile as written).
// NOTE(review): `nvcv::ExternalMemory` and this `nvcv::Image::Create` overload
// could not be confirmed from this page -- verify them against the CV-CUDA
// headers. The "Integration with CV-CUDA" section below wraps the pointer with
// nvcvImageWrapDataConstruct() instead.
void* cuda_ptr = allocator.allocate();
nvcv::ExternalMemory extMem(cuda_ptr, size, pitch, ...);
nvcv::Image img = nvcv::Image::Create(extMem, width, height, format);

Key API References

src/nvcv/ExternalMemory.cpp: shows how external memory is wrapped
include/nvcv/Image.hpp: image creation from external memory
src/nvcv/allocators/: check for existing allocator patterns

Notes

Use cudaMalloc/cudaFree for device memory management.
Pool allocators pre-allocate blocks and reuse them, reducing allocation overhead.
Integrate with CV-CUDA by wrapping allocated pointers as external memory.
Ensure thread safety if used in multi-threaded contexts.

Complete Working Example

Below is a tested, working implementation of a pool-based CUDA allocator that demonstrates the concept. In production, you would use nvcvImageWrapDataConstruct() to wrap the pooled memory into actual CV-CUDA images.

File: custom_cuda_allocator_example.cpp

#include <cuda_runtime.h>

#include <algorithm>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

// Simple pool-based CUDA allocator
class CudaPoolAllocator {
public:
    CudaPoolAllocator(size_t blockSize, int poolSize) 
        : blockSize_(blockSize), totalBlocks_(poolSize) {
        std::cout << "Creating pool with " << poolSize 
                  << " blocks of " << blockSize << " bytes each\n";
        
        for (int i = 0; i < poolSize; ++i) {
            void* ptr = nullptr;
            cudaError_t err = cudaMalloc(&ptr, blockSize_);
            if (err != cudaSuccess) {
                cleanup();
                throw std::runtime_error(std::string("cudaMalloc failed: ") 
                    + cudaGetErrorString(err));
            }
            pool_.push_back(ptr);
            std::cout << "  Allocated block " << i << " at " << ptr << "\n";
        }
    }

    void* allocate() {
        if (pool_.empty()) {
            throw std::runtime_error("Pool exhausted! All " 
                + std::to_string(totalBlocks_) + " blocks are in use.");
        }
        void* ptr = pool_.back();
        pool_.pop_back();
        std::cout << "Allocated from pool: " << ptr 
                  << " (" << pool_.size() << " blocks remaining)\n";
        return ptr;
    }

    void deallocate(void* ptr) {
        pool_.push_back(ptr);
        std::cout << "Returned to pool: " << ptr 
                  << " (" << pool_.size() << " blocks available)\n";
    }

    size_t getBlockSize() const { return blockSize_; }
    int getTotalBlocks() const { return totalBlocks_; }
    int getAvailableBlocks() const { return pool_.size(); }

    ~CudaPoolAllocator() {
        cleanup();
    }

private:
    void cleanup() {
        std::cout << "Cleaning up pool (" << pool_.size() << " blocks to free)...\n";
        for (void* ptr : pool_) {
            cudaFree(ptr);
        }
        pool_.clear();
    }

    size_t blockSize_;
    int totalBlocks_;
    std::vector<void*> pool_;
};

// Simulated CV-CUDA Image structure (for demonstration).
//
// Holds one pooled device block together with the image geometry and returns
// the block to `allocator` when destroyed. `allocator` is a non-owning pointer
// and must outlive the wrapper.
struct SimulatedImageWrapper {
    void* devicePtr;
    size_t width;
    size_t height;
    size_t channels;
    size_t rowStride;
    CudaPoolAllocator* allocator;

    SimulatedImageWrapper(void* ptr, size_t w, size_t h, size_t c, size_t stride, CudaPoolAllocator* alloc)
        : devicePtr(ptr), width(w), height(h), channels(c), rowStride(stride), allocator(alloc) {}

    // The destructor hands devicePtr back to the pool, so a copy would return
    // the same block twice. Forbid copying (this also suppresses the implicit
    // move operations).
    SimulatedImageWrapper(const SimulatedImageWrapper&) = delete;
    SimulatedImageWrapper& operator=(const SimulatedImageWrapper&) = delete;

    ~SimulatedImageWrapper() {
        // Nothing to return if the wrapper was built without a pool or a block.
        if (allocator && devicePtr) {
            allocator->deallocate(devicePtr);
        }
    }
};

int main() {
    try {
        std::cout << "=== Custom CUDA Pool Allocator Example ===\n\n";

        // Check CUDA device
        int deviceCount = 0;
        cudaError_t err = cudaGetDeviceCount(&deviceCount);
        if (err != cudaSuccess || deviceCount == 0) {
            throw std::runtime_error("No CUDA device found");
        }
        std::cout << "CUDA devices found: " << deviceCount << "\n\n";

        // Image dimensions
        const int width = 1920;
        const int height = 1080;
        const int channels = 3;
        
        // Calculate buffer size (simple RGB8 packed format)
        const size_t rowStride = width * channels;
        const size_t blockSize = rowStride * height;

        std::cout << "Image: " << width << "x" << height 
                  << " RGB8 (" << blockSize << " bytes = " 
                  << (blockSize / 1024.0 / 1024.0) << " MB)\n\n";

        // Create pool allocator with 4 blocks
        CudaPoolAllocator allocator(blockSize, 4);
        std::cout << "\n";

        // Demonstrate allocating and using multiple blocks
        std::cout << "--- Demonstrating pool usage ---\n";
        std::vector<SimulatedImageWrapper*> images;

        for (int i = 0; i < 3; ++i) {
            std::cout << "\nCreating image " << (i + 1) << "...\n";
            void* cuda_ptr = allocator.allocate();
            
            // In real CV-CUDA, you would call:
            // nvcvImageWrapDataConstruct(&imageData, &cleanup, &allocator, &handle);
            // Here we simulate wrapping the memory
            auto* img = new SimulatedImageWrapper(cuda_ptr, width, height, channels, rowStride, &allocator);
            images.push_back(img);
            
            std::cout << "  Created image wrapper at device ptr: " << img->devicePtr << "\n";
            std::cout << "  Pool status: " << allocator.getAvailableBlocks() << "/" 
                      << allocator.getTotalBlocks() << " blocks available\n";
        }

        std::cout << "\n--- Releasing images back to pool ---\n";
        for (size_t i = 0; i < images.size(); ++i) {
            std::cout << "\nReleasing image " << (i + 1) << "...\n";
            delete images[i];
            std::cout << "  Pool status: " << allocator.getAvailableBlocks() << "/" 
                      << allocator.getTotalBlocks() << " blocks available\n";
        }
        images.clear();

        std::cout << "\n=== Example completed successfully! ===\n";
        std::cout << "✓ Memory was allocated from pool\n";
        std::cout << "✓ Used in zero-copy fashion (wrapped existing CUDA pointers)\n";
        std::cout << "✓ Returned to pool for reuse when images were destroyed\n";
        std::cout << "\nIn production with CV-CUDA, use nvcvImageWrapDataConstruct() to\n";
        std::cout << "wrap your pooled CUDA memory into nvcv::Image objects.\n";

        return 0;

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}

How to Build and Run

In WSL (Linux):

# Compile with nvcc (host-only code, so a plain .cpp source file is fine).
# NOTE(review): nvcc normally links the CUDA runtime on its own, so -lcudart
# is likely redundant here -- harmless, but confirm for your toolkit version.
nvcc -o custom_cuda_allocator_example custom_cuda_allocator_example.cpp -lcudart

# Run the example (requires at least one CUDA device; otherwise the program
# prints an error and exits with status 1)
./custom_cuda_allocator_example

Expected Output:

=== Custom CUDA Pool Allocator Example ===

CUDA devices found: 1

Image: 1920x1080 RGB8 (6220800 bytes = 5.93262 MB)

Creating pool with 4 blocks of 6220800 bytes each
  Allocated block 0 at 0x504c00000
  Allocated block 1 at 0x505200000
  Allocated block 2 at 0x505800000
  Allocated block 3 at 0x505e00000

--- Demonstrating pool usage ---

Creating image 1...
Allocated from pool: 0x505e00000 (3 blocks remaining)
  Created image wrapper at device ptr: 0x505e00000
  Pool status: 3/4 blocks available

[... more images ...]

=== Example completed successfully! ===
✓ Memory was allocated from pool
✓ Used in zero-copy fashion (wrapped existing CUDA pointers)
✓ Returned to pool for reuse when images were destroyed

Integration with CV-CUDA

To use this with actual CV-CUDA, replace the simulated wrapper with:

#include <nvcv/Image.h>

// Cleanup callback handed to nvcvImageWrapDataConstruct(): gives the wrapped
// block back to the CudaPoolAllocator that was passed in through `ctx`.
void cudaMemoryCleanup(void* ctx, const NVCVImageData* data) {
    auto* pool = static_cast<CudaPoolAllocator*>(ctx);

    // Nothing to do without a pool, or for buffers that are not strided CUDA
    // memory.
    if (!pool || data->bufferType != NVCV_IMAGE_BUFFER_STRIDED_CUDA) {
        return;
    }

    // The pooled block's address is the base pointer stored in plane 0.
    pool->deallocate(data->buffer.strided.planes[0].basePtr);
}

// Create CV-CUDA image from pooled memory
void* cuda_ptr = allocator.allocate();

NVCVImageData imageData = {};
imageData.format = NVCV_IMAGE_FORMAT_RGB8;
imageData.bufferType = NVCV_IMAGE_BUFFER_STRIDED_CUDA;
imageData.buffer.strided.numPlanes = 1;
imageData.buffer.strided.planes[0].width = width;
imageData.buffer.strided.planes[0].height = height;
imageData.buffer.strided.planes[0].rowStride = rowStride;
imageData.buffer.strided.planes[0].basePtr = static_cast<NVCVByte*>(cuda_ptr);

NVCVImageHandle imageHandle = nullptr;
nvcvImageWrapDataConstruct(&imageData, &cudaMemoryCleanup, &allocator, &imageHandle);

// Use the image...
// When done: nvcvImageDecRef(imageHandle, nullptr);

Summary

This example demonstrates a working pool-based CUDA allocator that pre-allocates device memory and efficiently reuses it. The memory is wrapped in zero-copy fashion, avoiding expensive memory allocation/deallocation overhead. Use nvcvImageWrapDataConstruct() to integrate with CV-CUDA.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implementing custom CUDA memory allocator #262

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Implementing custom CUDA memory allocator #262

Uh oh!

VitalyVaryvdin Aug 12, 2025

Replies: 1 comment

Uh oh!

kgnandanwar Jan 19, 2026

Implementing Custom CUDA Memory Allocator in CV-CUDA

Problem

Solution

C++ Example

Key API References

Notes

Complete Working Example

How to Build and Run

Integration with CV-CUDA

Summary

VitalyVaryvdin
Aug 12, 2025

kgnandanwar
Jan 19, 2026