From d04cedb48c3902a5d4213d324abe62220aca861a Mon Sep 17 00:00:00 2001 From: JetsonHacks Date: Tue, 12 Mar 2019 04:57:44 +0000 Subject: [PATCH] Add BGRA Support (ZED Camera) --- CMakeLists.txt | 37 ++++++++++++-- src/cudaBGR.cu | 48 ++++++++++++++++++ src/cudaBGR.h | 15 ++++++ src/cudaUtility.h | 108 ++++++++++++++++++++++++++++++++++++++++ src/image_converter.cpp | 58 ++++++++++++++------- 5 files changed, 243 insertions(+), 23 deletions(-) create mode 100644 src/cudaBGR.cu create mode 100644 src/cudaBGR.h create mode 100644 src/cudaUtility.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 877561f..102100f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,20 +36,49 @@ include_directories( # enable c++11 (TensorRT requirement) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +# setup CUDA +find_package(CUDA) +message("-- CUDA version: ${CUDA_VERSION}") + +set( + CUDA_NVCC_FLAGS + ${CUDA_NVCC_FLAGS}; + -O3 + -gencode arch=compute_37,code=sm_37 + -gencode arch=compute_53,code=sm_53 + -gencode arch=compute_60,code=sm_60 + -gencode arch=compute_61,code=sm_61 + -gencode arch=compute_62,code=sm_62 +) + +if(CUDA_VERSION_MAJOR GREATER 9) + message("-- CUDA ${CUDA_VERSION_MAJOR} detected, enabling SM_72") + + set( + CUDA_NVCC_FLAGS + ${CUDA_NVCC_FLAGS}; + -gencode arch=compute_72,code=sm_72 + ) +endif() + # inference nodes -add_executable(imagenet src/node_imagenet.cpp src/image_converter.cpp) +#add_executable(imagenet src/node_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp ) +cuda_add_executable(imagenet src/node_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp ) target_link_libraries(imagenet ${catkin_LIBRARIES} jetson-inference) -add_executable(detectnet src/node_detectnet.cpp src/image_converter.cpp) +#add_executable(detectnet src/node_detectnet.cpp src/cudaBGR.cu src/image_converter.cpp) +cuda_add_executable(detectnet src/node_detectnet.cpp src/cudaBGR.cu src/image_converter.cpp) target_link_libraries(detectnet ${catkin_LIBRARIES} jetson-inference) -add_executable(segnet src/node_segnet.cpp src/image_converter.cpp) +#add_executable(segnet src/node_segnet.cpp src/cudaBGR.cu src/image_converter.cpp) +cuda_add_executable(segnet src/node_segnet.cpp src/cudaBGR.cu src/image_converter.cpp) target_link_libraries(segnet ${catkin_LIBRARIES} jetson-inference) # the library we are going to compile (has all of the nodelets in it) -add_library(ros_deep_learning_nodelets src/nodelet_imagenet.cpp src/image_converter.cpp) +#add_library(ros_deep_learning_nodelets src/nodelet_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp) +cuda_add_library(ros_deep_learning_nodelets src/nodelet_imagenet.cpp src/cudaBGR.cu src/image_converter.cpp) target_link_libraries(ros_deep_learning_nodelets ${catkin_LIBRARIES} jetson-inference) diff --git a/src/cudaBGR.cu b/src/cudaBGR.cu new file mode 100644 index 0000000..fcc547a --- /dev/null +++ b/src/cudaBGR.cu @@ -0,0 +1,48 @@ +#include "cudaBGR.h" + +//------------------------------------------------------------------------------------------------------------------------- + +template +__global__ void RGBAToRGBAf(uchar4* srcImage, + float4* dstImage, + int width, int height) +{ + const int x = (blockIdx.x * blockDim.x) + threadIdx.x; + const int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + const int pixel = y * width + x; + + if( x >= width ) + return; + + if( y >= height ) + return; + +// printf("cuda thread %i %i %i %i pixel %i \n", x, y, width, height, pixel); + + const float s = 1.0f; + const uchar4 px = srcImage[pixel]; + + if( isBGR ) + dstImage[pixel] = make_float4(px.z * s, px.y * s, px.x * s, px.w * s); + else + dstImage[pixel] = make_float4(px.x * s, px.y * s, px.z * s, px.w * s); +} + +/** + * Convert 8-bit fixed-point BGRA image to 32-bit floating-point RGBA image + * @ingroup util + */ +cudaError_t cudaBGRA8ToRGBA32( uchar4* srcDev, float4* destDev, size_t width, size_t height ) +{ + if( !srcDev || !destDev ) + return cudaErrorInvalidDevicePointer; + + const dim3 blockDim(8,8,1); + const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y), 1); + + RGBAToRGBAf<<>>( srcDev, destDev, width, height ); + + return CUDA(cudaGetLastError()); +} + diff --git a/src/cudaBGR.h b/src/cudaBGR.h new file mode 100644 index 0000000..bf2e763 --- /dev/null +++ b/src/cudaBGR.h @@ -0,0 +1,15 @@ +#ifndef __CUDA_BGR_CONVERT_H +#define __CUDA_BGR_CONVERT_H + + +#include "cudaUtility.h" +#include + + +/** + * Convert 8-bit fixed-point BGR image to 32-bit floating-point RGBA image + * @ingroup util + */ +cudaError_t cudaBGRA8ToRGBA32( uchar4* input, float4* output, size_t width, size_t height ); + +#endif diff --git a/src/cudaUtility.h b/src/cudaUtility.h new file mode 100644 index 0000000..467e0b7 --- /dev/null +++ b/src/cudaUtility.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __CUDA_UTILITY_H_ +#define __CUDA_UTILITY_H_ + + +#include +#include +#include +#include + + +/** + * Execute a CUDA call and print out any errors + * @return the original cudaError_t result + * @ingroup util + */ +#define CUDA(x) cudaCheckError((x), #x, __FILE__, __LINE__) + +/** + * Evaluates to true on success + * @ingroup util + */ +#define CUDA_SUCCESS(x) (CUDA(x) == cudaSuccess) + +/** + * Evaluates to true on failure + * @ingroup util + */ +#define CUDA_FAILED(x) (CUDA(x) != cudaSuccess) + +/** + * Return from the boolean function if CUDA call fails + * @ingroup util + */ +#define CUDA_VERIFY(x) if(CUDA_FAILED(x)) return false; + +/** + * LOG_CUDA string. + * @ingroup util + */ +#define LOG_CUDA "[cuda] " + +/* + * define this if you want all cuda calls to be printed + */ +//#define CUDA_TRACE + + + +/** + * cudaCheckError + * @ingroup util + */ +inline cudaError_t cudaCheckError(cudaError_t retval, const char* txt, const char* file, int line ) +{ +#if !defined(CUDA_TRACE) + if( retval == cudaSuccess) + return cudaSuccess; +#endif + + //int activeDevice = -1; + //cudaGetDevice(&activeDevice); + + //Log("[cuda] device %i - %s\n", activeDevice, txt); + + printf(LOG_CUDA "%s\n", txt); + + + if( retval != cudaSuccess ) + { + printf(LOG_CUDA " %s (error %u) (hex 0x%02X)\n", cudaGetErrorString(retval), retval, retval); + printf(LOG_CUDA " %s:%i\n", file, line); + } + + return retval; +} + + +/** + * iDivUp + * @ingroup util + */ +inline __device__ __host__ int iDivUp( int a, int b ) { return (a % b != 0) ? (a / b + 1) : (a / b); } + + + +#endif diff --git a/src/image_converter.cpp b/src/image_converter.cpp index ee14afe..69c20be 100644 --- a/src/image_converter.cpp +++ b/src/image_converter.cpp @@ -28,6 +28,7 @@ #include #include +#include "cudaBGR.h" // constructor imageConverter::imageConverter() @@ -55,16 +56,22 @@ imageConverter::~imageConverter() bool imageConverter::Convert( const sensor_msgs::ImageConstPtr& input ) { ROS_INFO("converting %ux%u %s image", input->width, input->height, input->encoding.c_str()); - - // confirm bgr8 encoding - if( input->encoding != sensor_msgs::image_encodings::BGR8 ) - { - ROS_ERROR("%ux%u image is in %s format, expected %s", input->width, input->height, input->encoding.c_str(), sensor_msgs::image_encodings::BGR8.c_str()); + if( input->encoding != sensor_msgs::image_encodings::BGR8 && + input->encoding != sensor_msgs::image_encodings::BGRA8 ) + { + ROS_ERROR("%ux%u image is in %s format, expected BGR8 or BGRA8", input->width, input->height, input->encoding.c_str()); return false; - } - - // confirm step size - const uint32_t input_stride = input->width * sizeof(uchar3); + } + // Default to uchar3 + bool isBGRA8 = false ; + size_t pixel_byte_size = sizeof(uchar3) ; + if (input->encoding == sensor_msgs::image_encodings::BGRA8) { + pixel_byte_size = sizeof(uchar4) ; + isBGRA8 = true; + } + + // confirm step size + const uint32_t input_stride = input->width * pixel_byte_size; if( input->step != input_stride ) { @@ -72,21 +79,34 @@ bool imageConverter::Convert( const sensor_msgs::ImageConstPtr& input ) return false; } - // assure memory allocation + // assure memory allocation if( !Resize(input->width, input->height) ) return false; // copy input to shared memory - memcpy(mInputCPU, input->data.data(), input->width * input->height * sizeof(uchar3)); // note: 3 channels assumes bgr/rgb - - // convert to RGBA32f format - if( CUDA_FAILED(cudaBGR8ToRGBA32((uchar3*)mInputGPU, (float4*)mOutputGPU, mWidth, mHeight)) ) - { - ROS_ERROR("failed to convert %ux%u image with CUDA", mWidth, mHeight); - return false; - } + memcpy(mInputCPU, input->data.data(), input->width * input->height * pixel_byte_size); + + + // convert to RGBA32f format + if (!isBGRA8) + { // Should be BGR8 - convert to RGBA32 + if( CUDA_FAILED(cudaBGR8ToRGBA32((uchar3*)mInputGPU, (float4*)mOutputGPU, mWidth, mHeight)) ) + { + ROS_ERROR("failed to convert %ux%u image with CUDA", mWidth, mHeight); + return false; + } + } + else { + // Should be BGRA8 - convert to RGBA32 + if( CUDA_FAILED(cudaBGRA8ToRGBA32((uchar4*)mInputGPU, (float4*)mOutputGPU, mWidth, mHeight)) ) + { + ROS_ERROR("failed to convert %ux%u image with CUDA", mWidth, mHeight); + return false; + } + } + + return true ; - return true; }