Skip to content

add some cuda conversion to avoid useless conversion #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 95 additions & 1 deletion camera/gstCamera.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ gstCamera::gstCamera()
mRingMutex = new QMutex();

mLatestRGBA = 0;
mLatestBGR8 = 0;
mLatestRingbuffer = 0;
mLatestRetrieved = false;

Expand All @@ -79,9 +80,11 @@ gstCamera::gstCamera()
mRingbufferCPU[n] = NULL;
mRingbufferGPU[n] = NULL;
mRGBA[n] = NULL;
mBGR8[n] = NULL;
}

mRGBAZeroCopy = false;
mBGR8ZeroCopy = false;
}


Expand Down Expand Up @@ -111,8 +114,99 @@ gstCamera::~gstCamera()

mRGBA[n] = NULL;
}
}
if( mBGR8[n] != NULL )
{
if( mBGR8ZeroCopy )
CUDA(cudaFreeHost(mBGR8[n]));
else
CUDA(cudaFree(mBGR8[n]));

mRGBA[n] = NULL;
}
}
}
bool gstCamera::ConvertBGR8( void* input, void** output, bool zeroCopy )
{
if( !input || !output )
return false;
if( mBGR8[0] != NULL && zeroCopy != mBGR8ZeroCopy )
{
for( uint32_t n=0; n < NUM_RINGBUFFERS; n++ )
{
if( mBGR8[n] != NULL )
{
if( mBGR8ZeroCopy )
CUDA(cudaFreeHost(mBGR8[n]));
else
CUDA(cudaFree(mBGR8[n]));

mBGR8[n] = NULL;
}
}

mBGR8ZeroCopy = false;
}

if( !mBGR8[0] )
{
const size_t size = mWidth * mHeight * sizeof(uchar3);

printf("%d %d %d\n",mWidth, mHeight, size);
for( uint32_t n=0; n < NUM_RINGBUFFERS; n++ )
{
if( zeroCopy )
{
void* cpuPtr = NULL;
void* gpuPtr = NULL;
if( !cudaAllocMapped(&cpuPtr, &gpuPtr, (size_t)size) )
{
printf(LOG_CUDA "gstCamera failed to allocate zeroCopy memory for %ux%xu BGR8 texture\n", mWidth, mHeight);
return false;
}
if( cpuPtr != gpuPtr )
{
printf(LOG_CUDA "gstCamera zeroCopy memory has different pointers, please use a UVA-compatible GPU\n");
return false;
}

mBGR8[n] = gpuPtr;
}
else
{
if( CUDA_FAILED(cudaMalloc(&mBGR8[n], (size_t)size)) )
{
printf(LOG_CUDA "gstCamera failed to allocate memory for %ux%u BGR8 texture\n", mWidth, mHeight);
return false;
}
}
}

printf(LOG_CUDA "gstreamer camera allocated %u RGBA ringbuffers\n", NUM_RINGBUFFERS);
mBGR8ZeroCopy = zeroCopy;
}

if( onboardCamera() )
{
// onboard camera is NV12
if( CUDA_FAILED(cudaNV12ToBGR8((uint8_t*)input, (uchar3*)mBGR8[mLatestBGR8], mWidth, mHeight)) )
{
return false;}
}
else
{
// USB webcam is RGB

if( CUDA_FAILED(cudaRGB8ToBGR8((uchar3*)input, (uchar3*)mBGR8[mLatestBGR8], mWidth, mHeight)) )
return false;
// memcpy((uchar3*)(mBGR8[mLatestBGR8]),(uchar3*)input,mWidth*mHeight*sizeof(uchar3));

}

*output = mBGR8[mLatestBGR8];
mLatestBGR8 = (mLatestBGR8 + 1) % NUM_RINGBUFFERS;
return true;
}



// onEOS
Expand Down
7 changes: 7 additions & 0 deletions camera/gstCamera.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ class gstCamera
// Takes in captured YUV-NV12 CUDA image, converts to float4 RGBA (with pixel intensity 0-255)
// Set zeroCopy to true if you need to access ConvertRGBA from CPU, otherwise it will be CUDA only.
bool ConvertRGBA( void* input, float** output, bool zeroCopy=false );

// Takes in captured YUV-NV12 CUDA image, converts to uint8 BGR (with pixel intensity 0-255)
// Set zeroCopy to true if you need to access ConvertBGR8 from CPU, otherwise it will be CUDA only.
bool ConvertBGR8( void* input, void** output, bool zeroCopy=false );

// Image dimensions
inline uint32_t GetWidth() const { return mWidth; }
Expand Down Expand Up @@ -126,11 +130,14 @@ class gstCamera
QMutex* mRingMutex;

uint32_t mLatestRGBA;
uint32_t mLatestBGR8;
uint32_t mLatestRingbuffer;
bool mLatestRetrieved;

void* mRGBA[NUM_RINGBUFFERS];
void* mBGR8[NUM_RINGBUFFERS];
bool mRGBAZeroCopy; // were the RGBA buffers allocated with zeroCopy?
bool mBGR8ZeroCopy;
bool mStreaming; // true if the device is currently open
int mV4L2Device; // -1 for onboard, >=0 for V4L2 device

Expand Down
36 changes: 36 additions & 0 deletions cuda/cudaRGB.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,28 @@ __global__ void RGBToRGBAf(uchar3* srcImage,
dstImage[pixel] = make_float4(px.x * s, px.y * s, px.z * s, 255.0f * s);
}

__global__ void RGB8ToBGR8(uchar3* srcImage,
uchar3* dstImage,
int width, int height)
{
const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
const int y = (blockIdx.y * blockDim.y) + threadIdx.y;

const int pixel = y * width + x;

if( x >= width )
return;

if( y >= height )
return;

// printf("cuda thread %i %i %i %i pixel %i \n", x, y, width, height, pixel);

const uchar3 px = srcImage[pixel];

dstImage[pixel] = make_uchar3(px.z, px.y, px.x );
}

cudaError_t cudaBGR8ToRGBA32( uchar3* srcDev, float4* destDev, size_t width, size_t height )
{
if( !srcDev || !destDev )
Expand All @@ -76,6 +98,20 @@ cudaError_t cudaRGB8ToRGBA32( uchar3* srcDev, float4* destDev, size_t width, siz
return CUDA(cudaGetLastError());
}


cudaError_t cudaRGB8ToBGR8( uchar3* srcDev, uchar3* destDev, size_t width, size_t height )
{
if( !srcDev || !destDev )
return cudaErrorInvalidDevicePointer;

const dim3 blockDim(8,8,1);
const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y), 1);

RGB8ToBGR8 <<<gridDim, blockDim>>>( srcDev, destDev, width, height );

return CUDA(cudaGetLastError());
}

//-------------------------------------------------------------------------------------------------------------------------
template<bool isBGRA>
__global__ void RGBAToRGBA8(float4* srcImage,
Expand Down
6 changes: 5 additions & 1 deletion cuda/cudaRGB.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ cudaError_t cudaRGBA32ToBGR8( float4* input, uchar3* output, size_t width, size_
*/
cudaError_t cudaRGBA32ToBGR8( float4* input, uchar3* output, size_t width, size_t height, const float2& inputRange );


/**
* Convert 8-bit fixed-point RGB mage into 8-bit fixed-point BGR image,
* with the 0-255 input range. Output range is 0-255.
*/
cudaError_t cudaRGB8ToBGR8( uchar3* srcDev, uchar3* destDev, size_t width, size_t height );
#endif

119 changes: 119 additions & 0 deletions cuda/cudaYUV-NV12.cu
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,97 @@ __global__ void NV12ToRGBAf(uint32_t* srcImage, size_t nSourcePitch,
}


__global__ void NV12ToBGR8(uint32_t* srcImage, size_t nSourcePitch,
uchar3* dstImage, size_t nDestPitch,
uint32_t width, uint32_t height)
{
int x, y;
uint32_t yuv101010Pel[2];
uint32_t processingPitch = ((width) + 63) & ~63;
uint8_t *srcImageU8 = (uint8_t *)srcImage;

processingPitch = nSourcePitch;

// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
y = blockIdx.y * blockDim.y + threadIdx.y;

if (x >= width)
return; //x = width - 1;

if (y >= height)
return; // y = height - 1;

#if 1
// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
// if we move to texture we could read 4 luminance values
yuv101010Pel[0] = (srcImageU8[y * processingPitch + x ]) << 2;
yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2;

uint32_t chromaOffset = processingPitch * height;
int y_chroma = y >> 1;

if (y & 1) // odd scanline ?
{
uint32_t chromaCb;
uint32_t chromaCr;

chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x ];
chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];

if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
{
chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x ] + 1) >> 1;
chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
}

yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}

// this steps performs the color conversion
uint32_t yuvi[6];
float red[2], green[2], blue[2];

yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

// YUV to RGB Transformation conversion
YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);

// Clamp the results to RGBA
//printf("cuda thread %i %i %f %f %f\n", x, y, red[0], green[0], blue[0]);

const float s = 1.0f / 1024.0f * 255.0f;

dstImage[y * width + x] = make_uchar3(blue[0] * s, green[0] * s, red[0] * s);
dstImage[y * width + x + 1] = make_uchar3(blue[1] * s, green[1] * s, red[1] * s);
#else
//printf("cuda thread %i %i %i %i \n", x, y, width, height);

dstImage[y * width + x] = make_float4(1.0f, 0.0f, 0.0f, 1.0f);
dstImage[y * width + x + 1] = make_float4(1.0f, 0.0f, 0.0f, 1.0f);
#endif
}



// cudaNV12ToRGBA
cudaError_t cudaNV12ToRGBA32( uint8_t* srcDev, size_t srcPitch, float4* destDev, size_t destPitch, size_t width, size_t height )
Expand Down Expand Up @@ -390,6 +481,34 @@ cudaError_t cudaNV12ToRGBA32( uint8_t* srcDev, float4* destDev, size_t width, si
}



cudaError_t cudaNV12ToBGR8( uint8_t* srcDev, size_t srcPitch, uchar3* destDev, size_t destPitch, size_t width, size_t height )
{
if( !srcDev || !destDev )
return cudaErrorInvalidDevicePointer;

if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
return cudaErrorInvalidValue;

if( !nv12ColorspaceSetup )
cudaNV12SetupColorspace();

const dim3 blockDim(8,8,1);
//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1);

NV12ToBGR8<<<gridDim, blockDim>>>( (uint32_t*)srcDev, srcPitch, destDev, destPitch, width, height );

return CUDA(cudaGetLastError());
}

cudaError_t cudaNV12ToBGR8( uint8_t* srcDev, uchar3* destDev, size_t width, size_t height )
{
return cudaNV12ToBGR8(srcDev, width * sizeof(uint8_t), destDev, width * sizeof(float4), width, height);
}



// cudaNV12SetupColorspace
cudaError_t cudaNV12SetupColorspace( float hue )
{
Expand Down
8 changes: 8 additions & 0 deletions cuda/cudaYUV.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,14 @@ cudaError_t cudaNV12ToRGBA( uint8_t* input, uchar4* output, size_t width, size_t
cudaError_t cudaNV12ToRGBA32( uint8_t* input, size_t inputPitch, float4* output, size_t outputPitch, size_t width, size_t height );
cudaError_t cudaNV12ToRGBA32( uint8_t* input, float4* output, size_t width, size_t height );

/**
* Convert an NV12 texture (semi-planar 4:2:0) to BGR uchar4 format.
* NV12 = 8-bit Y plane followed by an interleaved U/V plane with 2x2 subsampling.
*/
cudaError_t cudaNV12ToBGR8( uint8_t* input, size_t inputPitch, uchar3* output, size_t outputPitch, size_t width, size_t height );
cudaError_t cudaNV12ToBGR8( uint8_t* input, uchar3* output, size_t width, size_t height );


/**
* Setup NV12 color conversion constants.
* cudaNV12SetupColorspace() isn't necessary for the user to call, it will be
Expand Down