dusty-nv · nicolas-beaufort · Jun 4, 2019
diff --git a/camera/gstCamera.cpp b/camera/gstCamera.cpp
@@ -71,6 +71,7 @@ gstCamera::gstCamera()
 	mRingMutex  = new QMutex();
 
 	mLatestRGBA       = 0;
+	mLatestBGR8	  = 0;
 	mLatestRingbuffer = 0;
 	mLatestRetrieved  = false;
 
@@ -79,9 +80,11 @@ gstCamera::gstCamera()
 		mRingbufferCPU[n] = NULL;
 		mRingbufferGPU[n] = NULL;
 		mRGBA[n]          = NULL;
+		mBGR8[n] 	  = NULL;
 	}
 
 	mRGBAZeroCopy = false;
+	mBGR8ZeroCopy = false;
 }
 
 
@@ -111,8 +114,99 @@ gstCamera::~gstCamera()
 
 			mRGBA[n] = NULL; 
 		}
-	}
+		if( mBGR8[n] != NULL )
+                {
+                        if( mBGR8ZeroCopy )
+                                CUDA(cudaFreeHost(mBGR8[n]));
+                        else
+                                CUDA(cudaFree(mBGR8[n]));
+
+                        mRGBA[n] = NULL;
+                }
+	}	
 }
+bool gstCamera::ConvertBGR8( void* input, void** output, bool zeroCopy )
+{
+        if( !input || !output )
+                return false;
+        if( mBGR8[0] != NULL && zeroCopy != mBGR8ZeroCopy )
+        {
+                for( uint32_t n=0; n < NUM_RINGBUFFERS; n++ )
+                {
+                        if( mBGR8[n] != NULL )
+                        {
+                                if( mBGR8ZeroCopy )
+                                        CUDA(cudaFreeHost(mBGR8[n]));
+                                else
+                                        CUDA(cudaFree(mBGR8[n]));
+
+                                mBGR8[n] = NULL;
+                        }
+                }
+
+                mBGR8ZeroCopy = false; 
+        }
+
+        if( !mBGR8[0] )
+        {
+                const size_t size = mWidth * mHeight * sizeof(uchar3);
+
+                printf("%d %d %d\n",mWidth, mHeight, size);
+		for( uint32_t n=0; n < NUM_RINGBUFFERS; n++ )
+                {
+                        if( zeroCopy )
+                        {
+                                void* cpuPtr = NULL;
+                                void* gpuPtr = NULL;
+                                if( !cudaAllocMapped(&cpuPtr, &gpuPtr, (size_t)size) )
+                                {
+                                        printf(LOG_CUDA "gstCamera  failed to allocate zeroCopy memory for %ux%xu BGR8 texture\n", mWidth, mHeight);
+                                        return false;
+                                }
+                                if( cpuPtr != gpuPtr )
+                                {
+                                        printf(LOG_CUDA "gstCamera  zeroCopy memory has different pointers, please use a UVA-compatible GPU\n");
+                                        return false;
+                                }
+
+                                mBGR8[n] = gpuPtr;
+                        }
+                        else
+                        {
+                                if( CUDA_FAILED(cudaMalloc(&mBGR8[n], (size_t)size)) )
+                                {
+                                        printf(LOG_CUDA "gstCamera  failed to allocate memory for %ux%u BGR8 texture\n", mWidth, mHeight);
+                                        return false;
+                                }
+                        }
+                }
+
+                printf(LOG_CUDA "gstreamer camera  allocated %u RGBA ringbuffers\n", NUM_RINGBUFFERS);
+		mBGR8ZeroCopy = zeroCopy;
+        }
+
+        if( onboardCamera() )
+        {
+                // onboard camera is NV12
+                if( CUDA_FAILED(cudaNV12ToBGR8((uint8_t*)input, (uchar3*)mBGR8[mLatestBGR8], mWidth, mHeight)) )
+		{
+		    	return false;}
+        }
+        else
+        {
+                // USB webcam is RGB
+
+                if( CUDA_FAILED(cudaRGB8ToBGR8((uchar3*)input, (uchar3*)mBGR8[mLatestBGR8], mWidth, mHeight)) )
+                        return false;
+		// memcpy((uchar3*)(mBGR8[mLatestBGR8]),(uchar3*)input,mWidth*mHeight*sizeof(uchar3));
+
+        }
+
+        *output     = mBGR8[mLatestBGR8];
+        mLatestBGR8 = (mLatestBGR8 + 1) % NUM_RINGBUFFERS;
+        return true;
+}
+
 
 
 // onEOS

diff --git a/camera/gstCamera.h b/camera/gstCamera.h
@@ -80,6 +80,10 @@ class gstCamera
 	// Takes in captured YUV-NV12 CUDA image, converts to float4 RGBA (with pixel intensity 0-255)
 	// Set zeroCopy to true if you need to access ConvertRGBA from CPU, otherwise it will be CUDA only.
 	bool ConvertRGBA( void* input, float** output, bool zeroCopy=false );
+
+	// Takes in captured YUV-NV12 CUDA image, converts to uint8 BGR (with pixel intensity 0-255)
+	// Set zeroCopy to true if you need to access ConvertBGR8 from CPU, otherwise it will be CUDA only.
+	bool ConvertBGR8( void* input, void** output, bool zeroCopy=false );	
 
 	// Image dimensions
 	inline uint32_t GetWidth() const	   { return mWidth; }
@@ -126,11 +130,14 @@ class gstCamera
 	QMutex* mRingMutex;
 
 	uint32_t mLatestRGBA;
+	uint32_t mLatestBGR8;
 	uint32_t mLatestRingbuffer;
 	bool     mLatestRetrieved;
 
 	void*  mRGBA[NUM_RINGBUFFERS];
+	void*  mBGR8[NUM_RINGBUFFERS];
 	bool   mRGBAZeroCopy; // were the RGBA buffers allocated with zeroCopy?
+	bool   mBGR8ZeroCopy;
 	bool   mStreaming;	  // true if the device is currently open
 	int    mV4L2Device;	  // -1 for onboard, >=0 for V4L2 device
 

diff --git a/cuda/cudaRGB.cu b/cuda/cudaRGB.cu
@@ -50,6 +50,28 @@ __global__ void RGBToRGBAf(uchar3* srcImage,
 		dstImage[pixel] = make_float4(px.x * s, px.y * s, px.z * s, 255.0f * s);
 }
 
+__global__ void RGB8ToBGR8(uchar3* srcImage,
+                           uchar3* dstImage,
+                           int width, int height)
+{
+        const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+        const int pixel = y * width + x;
+
+        if( x >= width )
+                return;
+
+        if( y >= height )
+                return;
+
+//      printf("cuda thread %i %i  %i %i pixel %i \n", x, y, width, height, pixel);
+
+        const uchar3 px = srcImage[pixel];
+
+                dstImage[pixel] = make_uchar3(px.z, px.y, px.x );
+}
+
 cudaError_t cudaBGR8ToRGBA32( uchar3* srcDev, float4* destDev, size_t width, size_t height )
 {
 	if( !srcDev || !destDev )
@@ -76,6 +98,20 @@ cudaError_t cudaRGB8ToRGBA32( uchar3* srcDev, float4* destDev, size_t width, siz
 	return CUDA(cudaGetLastError());
 }
 
+
+cudaError_t cudaRGB8ToBGR8( uchar3* srcDev, uchar3* destDev, size_t width, size_t height )
+{
+        if( !srcDev || !destDev )
+                return cudaErrorInvalidDevicePointer;
+
+        const dim3 blockDim(8,8,1);
+        const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y), 1);
+
+        RGB8ToBGR8 <<<gridDim, blockDim>>>( srcDev, destDev, width, height );
+
+        return CUDA(cudaGetLastError());
+}
+
 //-------------------------------------------------------------------------------------------------------------------------
 template<bool isBGRA>
 __global__ void RGBAToRGBA8(float4* srcImage,

diff --git a/cuda/cudaRGB.h b/cuda/cudaRGB.h
@@ -97,6 +97,10 @@ cudaError_t cudaRGBA32ToBGR8( float4* input, uchar3* output, size_t width, size_
  */
 cudaError_t cudaRGBA32ToBGR8( float4* input, uchar3* output, size_t width, size_t height, const float2& inputRange );
 
-
+/**
+ * Convert 8-bit fixed-point RGB mage into 8-bit fixed-point BGR image,
+ * with the 0-255 input range.   Output range is 0-255.
+ */
+cudaError_t cudaRGB8ToBGR8( uchar3* srcDev, uchar3* destDev, size_t width, size_t height );
 #endif
 
diff --git a/cuda/cudaYUV-NV12.cu b/cuda/cudaYUV-NV12.cu
@@ -362,6 +362,97 @@ __global__ void NV12ToRGBAf(uint32_t* srcImage,  size_t nSourcePitch,
 }
 
 
+__global__ void NV12ToBGR8(uint32_t* srcImage,  size_t nSourcePitch,
+                           uchar3* dstImage,     size_t nDestPitch,
+                           uint32_t width,       uint32_t height)
+{
+    int x, y;
+    uint32_t yuv101010Pel[2];
+    uint32_t processingPitch = ((width) + 63) & ~63;
+    uint8_t *srcImageU8     = (uint8_t *)srcImage;
+
+    processingPitch = nSourcePitch;
+
+    // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
+    x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
+    y = blockIdx.y *  blockDim.y       +  threadIdx.y;
+
+    if (x >= width)
+        return; //x = width - 1;
+
+    if (y >= height)
+        return; // y = height - 1;
+
+#if 1
+    // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
+    // if we move to texture we could read 4 luminance values
+    yuv101010Pel[0] = (srcImageU8[y * processingPitch + x    ]) << 2;
+    yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2;
+
+    uint32_t chromaOffset    = processingPitch * height;
+    int y_chroma = y >> 1;
+
+    if (y & 1)  // odd scanline ?
+    {
+        uint32_t chromaCb;
+        uint32_t chromaCr;
+
+        chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x    ];
+        chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];
+
+        if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
+        {
+            chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x    ] + 1) >> 1;
+            chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
+        }
+
+        yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE       + 2));
+        yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+
+        yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE       + 2));
+        yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+    }
+    else
+    {
+yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x    ] << (COLOR_COMPONENT_BIT_SIZE       + 2));
+        yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+
+        yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x    ] << (COLOR_COMPONENT_BIT_SIZE       + 2));
+        yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
+    }
+
+    // this steps performs the color conversion
+    uint32_t yuvi[6];
+    float red[2], green[2], blue[2];
+
+    yuvi[0] = (yuv101010Pel[0] &   COLOR_COMPONENT_MASK);
+    yuvi[1] = ((yuv101010Pel[0] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
+    yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
+
+    yuvi[3] = (yuv101010Pel[1] &   COLOR_COMPONENT_MASK);
+    yuvi[4] = ((yuv101010Pel[1] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
+    yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
+
+    // YUV to RGB Transformation conversion
+    YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
+    YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
+
+    // Clamp the results to RGBA
+        //printf("cuda thread %i %i  %f %f %f\n", x, y, red[0], green[0], blue[0]);
+
+        const float s = 1.0f / 1024.0f * 255.0f;
+
+        dstImage[y * width + x]     = make_uchar3(blue[0] * s, green[0] * s, red[0] * s);
+        dstImage[y * width + x + 1] = make_uchar3(blue[1] * s, green[1] * s, red[1] * s);
+#else
+        //printf("cuda thread %i %i  %i %i \n", x, y, width, height);
+
+        dstImage[y * width + x]     = make_float4(1.0f, 0.0f, 0.0f, 1.0f);
+        dstImage[y * width + x + 1] = make_float4(1.0f, 0.0f, 0.0f, 1.0f);
+#endif
+}
+
+
 
 // cudaNV12ToRGBA
 cudaError_t cudaNV12ToRGBA32( uint8_t* srcDev, size_t srcPitch, float4* destDev, size_t destPitch, size_t width, size_t height )
@@ -390,6 +481,34 @@ cudaError_t cudaNV12ToRGBA32( uint8_t* srcDev, float4* destDev, size_t width, si
 }
 
 
+
+cudaError_t cudaNV12ToBGR8( uint8_t* srcDev, size_t srcPitch, uchar3* destDev, size_t destPitch, size_t width, size_t height )
+{
+        if( !srcDev || !destDev )
+                return cudaErrorInvalidDevicePointer;
+
+        if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
+                return cudaErrorInvalidValue;
+
+        if( !nv12ColorspaceSetup )
+                cudaNV12SetupColorspace();
+
+        const dim3 blockDim(8,8,1);
+        //const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
+        const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1);
+
+        NV12ToBGR8<<<gridDim, blockDim>>>( (uint32_t*)srcDev, srcPitch, destDev, destPitch, width, height );
+
+        return CUDA(cudaGetLastError());
+}
+
+cudaError_t cudaNV12ToBGR8( uint8_t* srcDev, uchar3* destDev, size_t width, size_t height )
+{
+        return cudaNV12ToBGR8(srcDev, width * sizeof(uint8_t), destDev, width * sizeof(float4), width, height);
+}
+
+
+
 // cudaNV12SetupColorspace
 cudaError_t cudaNV12SetupColorspace( float hue )
 {

diff --git a/cuda/cudaYUV.h b/cuda/cudaYUV.h
@@ -135,6 +135,14 @@ cudaError_t cudaNV12ToRGBA( uint8_t* input, uchar4* output, size_t width, size_t
 cudaError_t cudaNV12ToRGBA32( uint8_t* input, size_t inputPitch, float4* output, size_t outputPitch, size_t width, size_t height );
 cudaError_t cudaNV12ToRGBA32( uint8_t* input, float4* output, size_t width, size_t height );
 
+/**
+ * Convert an NV12 texture (semi-planar 4:2:0) to BGR uchar4 format.
+ * NV12 = 8-bit Y plane followed by an interleaved U/V plane with 2x2 subsampling.
+ */
+cudaError_t cudaNV12ToBGR8( uint8_t* input, size_t inputPitch, uchar3* output, size_t outputPitch, size_t width, size_t height );
+cudaError_t cudaNV12ToBGR8( uint8_t* input, uchar3* output, size_t width, size_t height );
+
+
 /**
  * Setup NV12 color conversion constants.
  * cudaNV12SetupColorspace() isn't necessary for the user to call, it will be