dusty-nv · jetsonhacks · Mar 12, 2019
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -36,20 +36,49 @@ include_directories(
 # enable c++11 (TensorRT requirement)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 
+# setup CUDA
+find_package(CUDA)
+message("-- CUDA version: ${CUDA_VERSION}")
+
+set(
+  CUDA_NVCC_FLAGS
+  ${CUDA_NVCC_FLAGS};
+  -O3
+  -gencode arch=compute_37,code=sm_37
+  -gencode arch=compute_53,code=sm_53
+  -gencode arch=compute_60,code=sm_60
+  -gencode arch=compute_61,code=sm_61
+  -gencode arch=compute_62,code=sm_62
+)
+
+if(CUDA_VERSION_MAJOR GREATER 9)
+  message("-- CUDA ${CUDA_VERSION_MAJOR} detected, enabling SM_72")
+
+  set(
+   CUDA_NVCC_FLAGS
+   ${CUDA_NVCC_FLAGS};
+   -gencode arch=compute_72,code=sm_72
+   )
+endif()
+
 
 # inference nodes
-add_executable(imagenet src/node_imagenet.cpp src/image_converter.cpp)
+#add_executable(imagenet src/node_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp )
+cuda_add_executable(imagenet src/node_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp )
 target_link_libraries(imagenet ${catkin_LIBRARIES} jetson-inference)
 
-add_executable(detectnet src/node_detectnet.cpp src/image_converter.cpp)
+#add_executable(detectnet src/node_detectnet.cpp src/cudaBGR.cu src/image_converter.cpp)
+cuda_add_executable(detectnet src/node_detectnet.cpp src/cudaBGR.cu src/image_converter.cpp)
 target_link_libraries(detectnet ${catkin_LIBRARIES} jetson-inference)
 
-add_executable(segnet src/node_segnet.cpp src/image_converter.cpp)
+#add_executable(segnet src/node_segnet.cpp src/cudaBGR.cu src/image_converter.cpp)
+cuda_add_executable(segnet src/node_segnet.cpp src/cudaBGR.cu src/image_converter.cpp)
 target_link_libraries(segnet ${catkin_LIBRARIES} jetson-inference)
 
 
 # the library we are going to compile (has all of the nodelets in it)
-add_library(ros_deep_learning_nodelets src/nodelet_imagenet.cpp src/image_converter.cpp)
+#add_library(ros_deep_learning_nodelets src/nodelet_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp)
+cuda_add_library(ros_deep_learning_nodelets src/nodelet_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp)
 target_link_libraries(ros_deep_learning_nodelets ${catkin_LIBRARIES} jetson-inference)
 
 

diff --git a/src/cudaBGR.cu b/src/cudaBGR.cu
@@ -0,0 +1,48 @@
+#include "cudaBGR.h"
+
+//-------------------------------------------------------------------------------------------------------------------------
+
+template<bool isBGR>
+__global__ void RGBAToRGBAf(uchar4* srcImage,
+                           float4* dstImage,
+                           int width, int height)
+{
+	const int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	const int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	const int pixel = y * width + x;
+
+	if( x >= width )
+		return; 
+
+	if( y >= height )
+		return;
+
+//	printf("cuda thread %i %i  %i %i pixel %i \n", x, y, width, height, pixel);
+
+	const float  s  = 1.0f;
+	const uchar4 px = srcImage[pixel];
+
+	if( isBGR )
+		dstImage[pixel] = make_float4(px.z * s, px.y * s, px.x * s, px.w * s);
+	else
+		dstImage[pixel] = make_float4(px.x * s, px.y * s, px.z * s, px.w * s);
+}
+
+/**
+ * Convert 8-bit fixed-point BGRA image to 32-bit floating-point RGBA image
+ * @ingroup util
+ */
+cudaError_t cudaBGRA8ToRGBA32( uchar4* srcDev, float4* destDev, size_t width, size_t height )
+{
+	if( !srcDev || !destDev )
+		return cudaErrorInvalidDevicePointer;
+
+	const dim3 blockDim(8,8,1);
+	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y), 1);
+
+	RGBAToRGBAf<true><<<gridDim, blockDim>>>( srcDev, destDev, width, height );
+
+	return CUDA(cudaGetLastError());
+}
+
diff --git a/src/cudaBGR.h b/src/cudaBGR.h
@@ -0,0 +1,15 @@
+#ifndef __CUDA_BGR_CONVERT_H
+#define __CUDA_BGR_CONVERT_H
+
+
+#include "cudaUtility.h"
+#include <stdint.h>
+
+
+/**
+ * Convert 8-bit fixed-point BGR image to 32-bit floating-point RGBA image
+ * @ingroup util
+ */
+cudaError_t cudaBGRA8ToRGBA32( uchar4* input, float4* output, size_t width, size_t height );
+
+#endif
diff --git a/src/cudaUtility.h b/src/cudaUtility.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __CUDA_UTILITY_H_
+#define __CUDA_UTILITY_H_
+
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <stdio.h>
+#include <string.h>
+
+
+/**
+ * Execute a CUDA call and print out any errors
+ * @return the original cudaError_t result
+ * @ingroup util
+ */
+#define CUDA(x)				cudaCheckError((x), #x, __FILE__, __LINE__)
+
+/**
+ * Evaluates to true on success
+ * @ingroup util
+ */
+#define CUDA_SUCCESS(x)			(CUDA(x) == cudaSuccess)
+
+/**
+ * Evaluates to true on failure
+ * @ingroup util
+ */
+#define CUDA_FAILED(x)			(CUDA(x) != cudaSuccess)
+
+/**
+ * Return from the boolean function if CUDA call fails
+ * @ingroup util
+ */
+#define CUDA_VERIFY(x)			if(CUDA_FAILED(x))	return false;
+
+/**
+ * LOG_CUDA string.
+ * @ingroup util
+ */
+#define LOG_CUDA "[cuda]   "
+
+/*
+ * define this if you want all cuda calls to be printed
+ */
+//#define CUDA_TRACE
+
+
+
+/**
+ * cudaCheckError
+ * @ingroup util
+ */
+inline cudaError_t cudaCheckError(cudaError_t retval, const char* txt, const char* file, int line )
+{
+#if !defined(CUDA_TRACE)
+	if( retval == cudaSuccess)
+		return cudaSuccess;
+#endif
+
+	//int activeDevice = -1;
+	//cudaGetDevice(&activeDevice);
+
+	//Log("[cuda]   device %i  -  %s\n", activeDevice, txt);
+
+	printf(LOG_CUDA "%s\n", txt);
+
+
+	if( retval != cudaSuccess )
+	{
+		printf(LOG_CUDA "   %s (error %u) (hex 0x%02X)\n", cudaGetErrorString(retval), retval, retval);
+		printf(LOG_CUDA "   %s:%i\n", file, line);	
+	}
+
+	return retval;
+}
+
+
+/**
+ * iDivUp
+ * @ingroup util
+ */
+inline __device__ __host__ int iDivUp( int a, int b )  		{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+
+
+#endif
diff --git a/src/image_converter.cpp b/src/image_converter.cpp
@@ -28,6 +28,7 @@
 #include <ros/ros.h>
 #include <sensor_msgs/image_encodings.h>
 
+#include "cudaBGR.h"
 
 // constructor
 imageConverter::imageConverter()
@@ -55,38 +56,57 @@ imageConverter::~imageConverter()
 bool imageConverter::Convert( const sensor_msgs::ImageConstPtr& input )
 {
 	ROS_INFO("converting %ux%u %s image", input->width, input->height, input->encoding.c_str());
-
-	// confirm bgr8 encoding
-	if( input->encoding != sensor_msgs::image_encodings::BGR8 )
-	{
-		ROS_ERROR("%ux%u image is in %s format, expected %s", input->width, input->height, input->encoding.c_str(), sensor_msgs::image_encodings::BGR8.c_str());
+	if( input->encoding != sensor_msgs::image_encodings::BGR8 &&  
+            input->encoding != sensor_msgs::image_encodings::BGRA8 )
+        {
+	    ROS_ERROR("%ux%u image is in %s format, expected BGR8 or BGRA8", input->width, input->height, input->encoding.c_str());
 		return false;
-	}
-
-	// confirm step size
-	const uint32_t input_stride = input->width * sizeof(uchar3);
+        }
+       // Default to uchar3
+       bool isBGRA8 = false ;
+       size_t pixel_byte_size = sizeof(uchar3) ;
+       if (input->encoding == sensor_msgs::image_encodings::BGRA8) {
+         pixel_byte_size = sizeof(uchar4) ;
+         isBGRA8 = true;
+       }
+
+       // confirm step size
+       const uint32_t input_stride = input->width * pixel_byte_size;
 
 	if( input->step != input_stride )
 	{
 		ROS_ERROR("%ux%u image has step size of %u bytes, expected %u bytes", input->width, input->height, input->step, input_stride);
 		return false;
 	}
 
-	// assure memory allocation
+        // assure memory allocation
 	if( !Resize(input->width, input->height) )
 		return false;
 
 	// copy input to shared memory
-	memcpy(mInputCPU, input->data.data(), input->width * input->height * sizeof(uchar3));	// note: 3 channels assumes bgr/rgb			
-
-	// convert to RGBA32f format
-	if( CUDA_FAILED(cudaBGR8ToRGBA32((uchar3*)mInputGPU, (float4*)mOutputGPU, mWidth, mHeight)) )
-	{
-		ROS_ERROR("failed to convert %ux%u image with CUDA", mWidth, mHeight);
-		return false;
-	}
+	memcpy(mInputCPU, input->data.data(), input->width * input->height * pixel_byte_size);
+
+
+        // convert to RGBA32f format
+        if (!isBGRA8)
+         { // Should be BGR8 - convert to RGBA32
+	   if( CUDA_FAILED(cudaBGR8ToRGBA32((uchar3*)mInputGPU, (float4*)mOutputGPU, mWidth, mHeight)) )
+	    {
+	      ROS_ERROR("failed to convert %ux%u image with CUDA", mWidth, mHeight);
+	      return false;
+	    }
+         }
+         else {
+           // Should be BGRA8 - convert to RGBA32
+           if( CUDA_FAILED(cudaBGRA8ToRGBA32((uchar4*)mInputGPU, (float4*)mOutputGPU, mWidth, mHeight)) )
+	      {
+		  ROS_ERROR("failed to convert %ux%u image with CUDA", mWidth, mHeight);
+		  return false;
+	      }
+         }
+
+       return true ;
 
-	return true;
 }