Skip to content
This repository was archived by the owner on Jun 10, 2024. It is now read-only.

Commit 5742166

Browse files
committed
Merge branch 'on_gpu_buffer'
2 parents bbd8baa + dc64956 commit 5742166

File tree

6 files changed

+392
-3
lines changed

6 files changed

+392
-3
lines changed

PyNvCodec/TC/inc/MemoryInterfaces.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,36 @@ class DllExport Buffer final : public Token {
104104
#endif
105105
};
106106

107+
/* Owning wrapper around a raw CUDA device-memory allocation of
 * elem_size * num_elems bytes, tied to a specific CUcontext.
 * Non-copyable and non-assignable; use Clone() for an explicit deep copy.
 */
class DllExport CudaBuffer final : public Token {
public:
  CudaBuffer() = delete;
  CudaBuffer(const CudaBuffer &other) = delete;
  // Fixed: take const& like the deleted copy ctor, so the two deleted
  // copy operations are declared consistently.
  CudaBuffer &operator=(const CudaBuffer &other) = delete;

  /* Factory: allocates elemSize * numElems bytes of device memory in the
   * given context. Throws std::bad_alloc if the allocation fails. */
  static CudaBuffer *Make(size_t elemSize, size_t numElems, CUcontext context);
  /* Deep copy (device-to-device); returns nullptr if the copy fails. */
  CudaBuffer *Clone();

  size_t GetRawMemSize() const { return elem_size * num_elems; }
  size_t GetNumElems() const { return num_elems; }
  size_t GetElemSize() const { return elem_size; }
  CUdeviceptr GpuMem() { return gpuMem; }
  ~CudaBuffer();

private:
  CudaBuffer(size_t elemSize, size_t numElems, CUcontext context);
  bool Allocate();
  void Deallocate();

  CUdeviceptr gpuMem = 0UL; // device pointer; 0 when not allocated
  CUcontext ctx = nullptr;  // context the memory belongs to
  size_t elem_size = 0U;
  size_t num_elems = 0U;

#ifdef TRACK_TOKEN_ALLOCATIONS
  uint64_t id = 0U; // allocation-tracker id used by leak reports
#endif
};
136+
107137
/* RAII-style CUDA Context (un)lock;
108138
*/
109139
class DllExport CudaCtxPush final {

PyNvCodec/TC/inc/Tasks.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,26 @@ class DllExport CudaUploadFrame final : public Task {
143143
struct CudaUploadFrame_Impl *pImpl = nullptr;
144144
};
145145

146+
/* Task that uploads a host memory buffer to GPU memory.
 * One input (host Buffer), one output (CudaBuffer). The destination GPU
 * buffer is owned by the task's pimpl and reused across Run() calls. */
class DllExport UploadBuffer final : public Task {
public:
  UploadBuffer() = delete;
  UploadBuffer(const UploadBuffer &other) = delete;
  UploadBuffer &operator=(const UploadBuffer &other) = delete;

  TaskExecStatus Run() final;
  // Presumably the upload size in bytes — definition not visible here.
  size_t GetUploadSize() const;
  ~UploadBuffer() final;
  /* Factory: a GPU buffer of elem_size * num_elems bytes is allocated in
   * cuContext; copies run asynchronously on cuStream. */
  static UploadBuffer *Make(CUstream cuStream, CUcontext cuContext,
                            uint32_t elem_size, uint32_t num_elems);

private:
  UploadBuffer(CUstream cuStream, CUcontext cuContext,
               uint32_t elem_size, uint32_t num_elems);
  static const uint32_t numInputs = 1U;
  static const uint32_t numOutputs = 1U;
  struct UploadBuffer_Impl *pImpl = nullptr; // pimpl; owns the GPU buffer
};
165+
146166
class DllExport CudaDownloadSurface final : public Task {
147167
public:
148168
CudaDownloadSurface() = delete;
@@ -163,6 +183,25 @@ class DllExport CudaDownloadSurface final : public Task {
163183
struct CudaDownloadSurface_Impl *pImpl = nullptr;
164184
};
165185

186+
/* Task that downloads a CudaBuffer from GPU memory into host memory.
 * One input (CudaBuffer), one output (host Buffer). The host destination
 * buffer is owned by the task's pimpl and reused across Run() calls. */
class DllExport DownloadCudaBuffer final : public Task {
public:
  DownloadCudaBuffer() = delete;
  DownloadCudaBuffer(const DownloadCudaBuffer &other) = delete;
  DownloadCudaBuffer &operator=(const DownloadCudaBuffer &other) = delete;

  ~DownloadCudaBuffer() final;
  TaskExecStatus Run() final;
  /* Factory: a host buffer of elem_size * num_elems bytes is allocated up
   * front; copies run asynchronously on cuStream. */
  static DownloadCudaBuffer *Make(CUstream cuStream, CUcontext cuContext,
                                  uint32_t elem_size, uint32_t num_elems);

private:
  DownloadCudaBuffer(CUstream cuStream, CUcontext cuContext,
                     uint32_t elem_size, uint32_t num_elems);
  static const uint32_t numInputs = 1U;
  static const uint32_t numOutputs = 1U;
  struct DownloadCudaBuffer_Impl *pImpl = nullptr; // pimpl; owns host buffer
};
204+
166205
class DllExport DemuxFrame final : public Task {
167206
public:
168207
DemuxFrame() = delete;

PyNvCodec/TC/src/MemoryInterfaces.cpp

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,12 @@ struct AllocRegister {
7979
}
8080
};
8181

82-
AllocRegister BuffersRegister, HWSurfaceRegister;
82+
AllocRegister BuffersRegister, HWSurfaceRegister, CudaBuffersRegiser;
8383

8484
bool CheckAllocationCounters() {
8585
auto numLeakedBuffers = BuffersRegister.GetSize();
8686
auto numLeakedSurfaces = HWSurfaceRegister.GetSize();
87+
auto numLeakedCudaBuffers = CudaBuffersRegiser.GetSize();
8788

8889
if (numLeakedBuffers) {
8990
cerr << "Leaked buffers (id : size): " << endl;
@@ -101,7 +102,15 @@ bool CheckAllocationCounters() {
101102
}
102103
}
103104

104-
return (0U == numLeakedBuffers) && (0U == numLeakedSurfaces);
105+
if (numLeakedCudaBuffers) {
106+
cerr << "Leaked CUDA buffers (id : size): " << endl;
107+
for (auto i = 0; i < numLeakedCudaBuffers; i++) {
108+
auto pNote = CudaBuffersRegiser.GetNoteByIndex(i);
109+
cerr << "\t" << pNote->id << "\t: " << pNote->size << endl;
110+
}
111+
}
112+
113+
return (0U == numLeakedBuffers) && (0U == numLeakedSurfaces) && (0U == numLeakedCudaBuffers);
105114
}
106115

107116
} // namespace VPF
@@ -263,6 +272,61 @@ Buffer *Buffer::MakeOwnMem(size_t bufferSize, const void *pCopyFrom,
263272
return new Buffer(bufferSize, pCopyFrom, ctx);
264273
}
265274

275+
/* Factory method: allocates elemSize * numElems bytes of device memory in
 * the given context. Throws std::bad_alloc (from the private ctor) if the
 * allocation fails. */
CudaBuffer* CudaBuffer::Make(size_t elemSize, size_t numElems, CUcontext context) {
  return new CudaBuffer(elemSize, numElems, context);
}
278+
279+
/* Deep copy: allocates a new CudaBuffer of identical geometry in the same
 * context and copies the device memory into it (synchronous D2D copy).
 * Returns nullptr if the copy fails; may throw std::bad_alloc from Make(). */
CudaBuffer *CudaBuffer::Clone() {
  auto pCopy = CudaBuffer::Make(elem_size, num_elems, ctx);

  // Fixed: make the owning context current for the copy, mirroring
  // Allocate(); previously the copy ran in whatever context was current.
  CudaCtxPush lock(ctx);
  if (CUDA_SUCCESS != cuMemcpyDtoD(pCopy->GpuMem(), GpuMem(), GetRawMemSize())) {
    delete pCopy;
    return nullptr;
  }

  return pCopy;
}
289+
290+
/* Releases the device memory allocation (and its tracker note, if any). */
CudaBuffer::~CudaBuffer() {
  Deallocate();
}
293+
294+
/* Private constructor: records the buffer geometry and owning context,
 * then allocates the device memory. Throws std::bad_alloc on failure
 * (including a zero-byte request, for which Allocate() returns false). */
CudaBuffer::CudaBuffer(size_t elemSize, size_t numElems, CUcontext context)
    // Idiom: initialize members in the init list (declaration order)
    // instead of assigning them in the body.
    : ctx(context), elem_size(elemSize), num_elems(numElems) {
  if (!Allocate()) {
    throw bad_alloc();
  }
}
303+
304+
/* Allocates GetRawMemSize() bytes of device memory within ctx.
 * Returns true on success, false when the requested size is zero.
 * Throws (via ThrowOnCudaError) if the CUDA allocation call itself fails. */
bool CudaBuffer::Allocate() {
  if (GetRawMemSize()) {
    // Make the owning context current for the allocation.
    CudaCtxPush lock(ctx);
    auto res = cuMemAlloc(&gpuMem, GetRawMemSize());
    ThrowOnCudaError(res, __LINE__);

    if (0U != gpuMem) {
#ifdef TRACK_TOKEN_ALLOCATIONS
      // Register for leak tracking. NOTE(review): "CudaBuffersRegiser" is a
      // project-wide misspelling of "Register"; keep it consistent with the
      // declaration in this file.
      id = CudaBuffersRegiser.AddNote(GetRawMemSize());
#endif
      return true;
    }
  }
  return false;
}
319+
320+
void CudaBuffer::Deallocate() {
321+
ThrowOnCudaError(cuMemFree(gpuMem), __LINE__);
322+
gpuMem = 0U;
323+
324+
#ifdef TRACK_TOKEN_ALLOCATIONS
325+
AllocInfo info(id, GetRawMemSize());
326+
CudaBuffersRegiser.DeleteNote(info);
327+
#endif
328+
}
329+
266330
SurfacePlane::SurfacePlane() = default;
267331

268332
SurfacePlane &SurfacePlane::operator=(const SurfacePlane &other) {

PyNvCodec/TC/src/Tasks.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,65 @@ TaskExecStatus CudaUploadFrame::Run() {
500500
return TASK_EXEC_SUCCESS;
501501
}
502502

503+
namespace VPF {
/* Pimpl for UploadBuffer: owns the destination CudaBuffer and keeps the
 * CUDA stream / context used for the asynchronous host-to-device copies. */
struct UploadBuffer_Impl {
  CUstream cuStream;
  CUcontext cuContext;
  CudaBuffer *pBuffer = nullptr; // owned; reused across Run() calls

  UploadBuffer_Impl() = delete;
  UploadBuffer_Impl(const UploadBuffer_Impl &other) = delete;
  UploadBuffer_Impl &operator=(const UploadBuffer_Impl &other) = delete;

  UploadBuffer_Impl(CUstream stream, CUcontext context,
                    uint32_t elem_size, uint32_t num_elems)
      : cuStream(stream), cuContext(context) {
    // May throw std::bad_alloc if the device allocation fails.
    pBuffer = CudaBuffer::Make(elem_size, num_elems, context);
  }

  ~UploadBuffer_Impl() { delete pBuffer; }
};
} // namespace VPF
522+
523+
/* Factory for the upload task. May throw std::bad_alloc if the pimpl's GPU
 * buffer allocation fails. */
UploadBuffer *UploadBuffer::Make(CUstream cuStream, CUcontext cuContext,
                                 uint32_t elem_size, uint32_t num_elems) {
  return new UploadBuffer(cuStream, cuContext, elem_size, num_elems);
}
527+
528+
/* Registers the task with the base class (1 input, 1 output, stream-sync
 * hook bound to cuStream) and creates the pimpl that owns the GPU buffer. */
UploadBuffer::UploadBuffer(CUstream cuStream, CUcontext cuContext,
                           uint32_t elem_size, uint32_t num_elems)
    :

      Task("UploadBuffer", UploadBuffer::numInputs,
           UploadBuffer::numOutputs, cuda_stream_sync, (void *)cuStream) {
  pImpl = new UploadBuffer_Impl(cuStream, cuContext, elem_size, num_elems);
}
536+
537+
// Destroys the pimpl, which releases the owned GPU buffer.
UploadBuffer::~UploadBuffer() { delete pImpl; }
538+
539+
/* Uploads the host Buffer set as input 0 into the task-owned CudaBuffer
 * (asynchronous H2D copy on the configured stream), then exposes the
 * CudaBuffer as output 0.
 * Returns TASK_EXEC_FAIL when no input is set or the copy call fails.
 * NOTE(review): the input token is cast to Buffer* unchecked, and the copy
 * length is the GPU buffer's size — assumes the host buffer holds at least
 * that many bytes; confirm with callers. */
TaskExecStatus UploadBuffer::Run() {
  NvtxMark tick(__FUNCTION__);
  if (!GetInput()) {
    return TASK_EXEC_FAIL;
  }

  ClearOutputs();

  auto stream = pImpl->cuStream;
  auto context = pImpl->cuContext;
  auto pBuffer = pImpl->pBuffer;
  auto pSrcHost = ((Buffer *)GetInput())->GetDataAs<void>();

  // Make the destination buffer's context current for the copy.
  CudaCtxPush lock(context);
  if (CUDA_SUCCESS != cuMemcpyHtoDAsync(pBuffer->GpuMem(), (const void *)pSrcHost,
                                        pBuffer->GetRawMemSize(), stream)) {
    return TASK_EXEC_FAIL;
  }

  SetOutput(pBuffer, 0);
  return TASK_EXEC_SUCCESS;
}
561+
503562
namespace VPF {
504563
struct CudaDownloadSurface_Impl {
505564
CUstream cuStream;
@@ -538,6 +597,25 @@ struct CudaDownloadSurface_Impl {
538597

539598
~CudaDownloadSurface_Impl() { delete pHostFrame; }
540599
};
600+
601+
/* Pimpl for DownloadCudaBuffer: owns the destination host Buffer and keeps
 * the CUDA stream / context used for the asynchronous device-to-host copies. */
struct DownloadCudaBuffer_Impl {
  CUstream cuStream;
  CUcontext cuContext;
  Buffer *pHostBuffer = nullptr; // owned; reused across Run() calls

  DownloadCudaBuffer_Impl() = delete;
  DownloadCudaBuffer_Impl(const DownloadCudaBuffer_Impl &other) = delete;
  DownloadCudaBuffer_Impl &
  operator=(const DownloadCudaBuffer_Impl &other) = delete;

  DownloadCudaBuffer_Impl(CUstream stream, CUcontext context, uint32_t elem_size,
                          uint32_t num_elems)
      : cuStream(stream), cuContext(context) {
    // NOTE(review): elem_size * num_elems is 32-bit arithmetic here; very
    // large buffers could overflow before the size_t parameter — confirm
    // the intended limits.
    pHostBuffer = Buffer::MakeOwnMem(elem_size * num_elems, context);
  }

  ~DownloadCudaBuffer_Impl() { delete pHostBuffer; }
};
541619
} // namespace VPF
542620

543621
CudaDownloadSurface *CudaDownloadSurface::Make(CUstream cuStream,
@@ -601,6 +679,45 @@ TaskExecStatus CudaDownloadSurface::Run() {
601679
return TASK_EXEC_SUCCESS;
602680
}
603681

682+
/* Factory for the download task. The host-side destination buffer
 * (elem_size * num_elems bytes) is allocated up front by the pimpl. */
DownloadCudaBuffer *DownloadCudaBuffer::Make(CUstream cuStream, CUcontext cuContext,
                                             uint32_t elem_size, uint32_t num_elems) {
  return new DownloadCudaBuffer(cuStream, cuContext, elem_size, num_elems);
}
686+
687+
/* Registers the task with the base class (1 input, 1 output, stream-sync
 * hook bound to cuStream) and creates the pimpl that owns the host buffer. */
DownloadCudaBuffer::DownloadCudaBuffer(CUstream cuStream, CUcontext cuContext,
                                       uint32_t elem_size, uint32_t num_elems) :
    Task("DownloadCudaBuffer", DownloadCudaBuffer::numInputs,
         DownloadCudaBuffer::numOutputs, cuda_stream_sync,
         (void *)cuStream) {
  pImpl = new DownloadCudaBuffer_Impl(cuStream, cuContext, elem_size, num_elems);
}
694+
695+
// Destroys the pimpl, which releases the owned host buffer.
DownloadCudaBuffer::~DownloadCudaBuffer() { delete pImpl; }
696+
697+
/* Downloads the CudaBuffer set as input 0 into the task-owned host Buffer
 * (asynchronous D2H copy on the configured stream), then exposes the host
 * Buffer as output 0.
 * Returns TASK_EXEC_FAIL when no input is set or the copy call fails.
 * NOTE(review): the input token is cast to CudaBuffer* unchecked, and the
 * copy length is the GPU buffer's size — assumes the host buffer was
 * created at least that large; confirm with callers. */
TaskExecStatus DownloadCudaBuffer::Run() {
  NvtxMark tick(__FUNCTION__);

  if (!GetInput()) {
    return TASK_EXEC_FAIL;
  }

  ClearOutputs();

  auto stream = pImpl->cuStream;
  auto context = pImpl->cuContext;
  auto pCudaBuffer = (CudaBuffer *)GetInput();
  // Fixed: pHostBuffer is already declared as Buffer*, so the original
  // (Buffer *) cast was redundant and has been removed.
  auto pDstHost = pImpl->pHostBuffer->GetDataAs<void>();

  // Make the source buffer's context current for the copy.
  CudaCtxPush lock(context);
  if (CUDA_SUCCESS != cuMemcpyDtoHAsync(pDstHost, pCudaBuffer->GpuMem(),
                                        pCudaBuffer->GetRawMemSize(), stream)) {
    return TASK_EXEC_FAIL;
  }

  SetOutput(pImpl->pHostBuffer, 0);
  return TASK_EXEC_SUCCESS;
}
720+
604721
namespace VPF {
605722
struct DemuxFrame_Impl {
606723
size_t videoBytes = 0U;

PyNvCodec/inc/PyNvCodec.hpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,23 @@ class PyFrameUploader {
8484
std::shared_ptr<Surface> UploadSingleFrame(py::array_t<float> &frame);
8585
};
8686

87+
/* pybind11-facing helper: uploads a numpy uint8 array into a CudaBuffer
 * via an UploadBuffer task. */
class PyBufferUploader {
  std::unique_ptr<UploadBuffer> uploader;
  uint32_t elem_size, num_elems; // geometry of the destination GPU buffer

public:
  // Construct from a GPU ordinal — presumably selects that GPU's context
  // and stream internally; definition not visible here.
  PyBufferUploader(uint32_t elemSize, uint32_t numElems, uint32_t gpu_ID);

  // Construct with an explicit CUDA context and stream.
  PyBufferUploader(uint32_t elemSize, uint32_t numElems, CUcontext ctx,
                   CUstream str);

  /* Convenience overload: context/stream passed as raw integer handles
   * (as Python sees them) and reinterpreted as CUDA handles. */
  PyBufferUploader(uint32_t elemSize, uint32_t numElems,
                   size_t ctx, size_t str) :
    PyBufferUploader(elemSize, numElems, (CUcontext)ctx, (CUstream)str) {}

  // Uploads the array; shared_ptr so Python can own the resulting buffer.
  std::shared_ptr<CudaBuffer> UploadSingleBuffer(py::array_t<uint8_t> &buffer);
};
103+
87104
class PySurfaceDownloader {
88105
std::unique_ptr<CudaDownloadSurface> upDownloader;
89106
uint32_t surfaceWidth, surfaceHeight;
@@ -108,6 +125,24 @@ class PySurfaceDownloader {
108125
py::array_t<float> &frame);
109126
};
110127

128+
/* pybind11-facing helper: downloads a CudaBuffer into a numpy uint8 array
 * via a DownloadCudaBuffer task. */
class PyCudaBufferDownloader {
  std::unique_ptr<DownloadCudaBuffer> upDownloader;
  uint32_t elem_size, num_elems; // geometry of the source GPU buffer

public:
  // Construct from a GPU ordinal — presumably selects that GPU's context
  // and stream internally; definition not visible here.
  PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, uint32_t gpu_ID);

  // Construct with an explicit CUDA context and stream.
  PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, CUcontext ctx,
                         CUstream str);

  /* Convenience overload: context/stream passed as raw integer handles
   * (as Python sees them) and reinterpreted as CUDA handles. */
  PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems,
                         size_t ctx, size_t str) :
    PyCudaBufferDownloader(elemSize, numElems, (CUcontext)ctx, (CUstream)str) {}

  // Copies the GPU buffer contents into np_array; returns success flag.
  bool DownloadSingleCudaBuffer(std::shared_ptr<CudaBuffer> buffer,
                                py::array_t<uint8_t> &np_array);
};
145+
111146
class PySurfaceConverter {
112147
std::unique_ptr<ConvertSurface> upConverter;
113148
std::unique_ptr<Buffer> upCtxBuffer;

0 commit comments

Comments
 (0)