diff --git a/mmros/include/mmros/process/image.hpp b/mmros/include/mmros/process/image.hpp index 5399f3f..047f00b 100644 --- a/mmros/include/mmros/process/image.hpp +++ b/mmros/include/mmros/process/image.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 TIER IV, Inc. +// Copyright 2025 Kotaro Uetake. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,189 +15,28 @@ #ifndef MMROS__PROCESS__IMAGE_HPP_ #define MMROS__PROCESS__IMAGE_HPP_ -#include -#include -#include -#include -#include +#include "mmros/process/image_kernel.hpp" -namespace mmros::process -{ -struct Roi -{ - int x; - int y; - int w; - int h; -}; - -/** - * @brief Resize a image using bilinear interpolation on gpus - * @param[out] dst Resized image - * @param[in] src image - * @param[in] d_w width for resized image - * @param[in] d_h height for resized image - * @param[in] d_c channel for resized image - * @param[in] s_w width for input image - * @param[in] s_h height for input image - * @param[in] s_c channel for input image - * @param[in] stream cuda stream - */ -extern void resize_bilinear_gpu( - unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, - cudaStream_t stream); - -/** - * @brief Letterbox a image on gpus - * @param[out] dst letterbox-ed image - * @param[in] src image - * @param[in] d_w width for letterbox-ing - * @param[in] d_h height for letterbox-ing - * @param[in] d_c channel for letterbox-ing - * @param[in] s_w width for input image - * @param[in] s_h height for input image - * @param[in] s_c channel for input image - * @param[in] stream cuda stream - */ -extern void letterbox_gpu( - unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, - cudaStream_t stream); - -/** - * @brief NHWC to NHWC conversion - * @param[out] dst converted image - * @param[in] src image - * @param[in] d_w width for a image - * @param[in] d_h height for a image - * @param[in] d_c channel for a image - * @param[in] stream cuda stream - */ -extern void nchw_to_nhwc_gpu( - unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream); - -/** - * @brief Unsigned char to float32 for inference - * @param[out] dst32 converted image - * @param[in] src image - * @param[in] d_w width for a image - * @param[in] d_h height for a image - * @param[in] d_c channel for a image - * @param[in] stream cuda stream - */ -extern void to_float_gpu( - float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream); - -/** - * @brief Resize and letterbox a image using bilinear interpolation on gpus - * @param[out] dst processed image - * @param[in] src image - * @param[in] d_w width for output - * @param[in] d_h height for output - * @param[in] d_c channel for output - * @param[in] s_w width for input - * @param[in] s_h height for input - * @param[in] s_c channel for input - * @param[in] stream cuda stream - */ -extern void resize_bilinear_letterbox_gpu( - unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, - cudaStream_t stream); - -/** - * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization - * for YOLOX on gpus - * @param[out] dst processed image - * @param[in] src image - * @param[in] d_w width for output - * @param[in] d_h height for output - * @param[in] d_c channel for output - * @param[in] s_w width for input - * @param[in] s_h height for input - * @param[in] s_c channel for input - * @param[in] mean mean values for each channel - * @param[in] std std values for each channel - * @param[in] stream cuda stream - */ -extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu( - float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, - float * mean, float * std, cudaStream_t stream); - -/** - * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization - * with batching for YOLOX on gpus - * @param[out] dst processed image - * @param[in] src image - * @param[in] d_w width for output - * @param[in] d_h height for output - * @param[in] d_c channel for output - * @param[in] s_w width for input - * @param[in] s_h height for input - * @param[in] s_c channel for input - * @param[in] batch batch size - * @param[in] mean mean values for each channel - * @param[in] std std values for each channel - * @param[in] stream cuda stream - */ -extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, int batch, - float * mean, float * std, cudaStream_t stream); +#include -/** - * @brief Optimized preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat and - * normalization with batching for YOLOX on gpus - * @param[out] dst processed image - * @param[in] src image - * @param[in] d_w width for output - * @param[in] d_h height for output - * @param[in] d_c channel for output - * @param[in] s_w width for input - * @param[in] s_h height for input - * @param[in] s_c channel for input - * @param[in] d_roi regions of interest for cropping - * @param[in] batch batch size - * @param[in] mean mean values for each channel - * @param[in] std std values for each channel - * @param[in] stream cuda stream - */ -extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, - int s_c, int batch, float * mean, float * std, cudaStream_t stream); - -/** - * @brief Optimized multi-scale preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat - * and normalization with batching for YOLOX on gpus - * @param[out] dst processed image - * @param[in] src image - * @param[in] d_w width for output - * @param[in] d_h height for output - * @param[in] d_c channel for output - * @param[in] s_w width for input - * @param[in] s_h height for input - * @param[in] s_c channel for input - * @param[in] d_roi regions of interest for cropping - * @param[in] batch batch size - * @param[in] mean mean values for each channel - * @param[in] std std values for each channel - * @param[in] stream cuda stream - */ -extern void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, - int s_c, int batch, float * mean, float * std, cudaStream_t stream); +#include +namespace mmros::process +{ /** - * @brief Argmax on GPU - * @param[out] dst processed image - * @param[in] src probability map - * @param[in] d_w width for output - * @param[in] d_h height for output - * @param[in] s_w width for input - * @param[in] s_h height for input - * @param[in] s_c channel for input - * @param[in] batch batch size - * @param[in] stream cuda stream - */ -extern void argmax_gpu( - unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, - cudaStream_t stream); + * Run preprocessing for image. + * + * @param input_d Pointer to input image on device. + * @param scales Mutable reference to the vector of scale factors. + * @param images Read-only reference to the vector of source images. + * @param in_width Model input width. + * @param in_height Model input height. + * @param mean Pointer to the image mean values. + * @param std Pointer to the image std values. + * @param stream CUDA stream. + */ +void preprocess_image( + float * input_d, std::vector & scales, const std::vector & images, + int64_t in_width, int64_t in_height, float * mean, float * std, cudaStream_t stream); } // namespace mmros::process #endif // MMROS__PROCESS__IMAGE_HPP_ diff --git a/mmros/include/mmros/process/image_kernel.hpp b/mmros/include/mmros/process/image_kernel.hpp new file mode 100644 index 0000000..885720b --- /dev/null +++ b/mmros/include/mmros/process/image_kernel.hpp @@ -0,0 +1,203 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MMROS__PROCESS__IMAGE_KERNEL_HPP_ +#define MMROS__PROCESS__IMAGE_KERNEL_HPP_ + +#include +#include +#include +#include +#include + +namespace mmros::process +{ +struct Roi +{ + int x; + int y; + int w; + int h; +}; + +/** + * @brief Resize a image using bilinear interpolation on gpus + * @param[out] dst Resized image + * @param[in] src image + * @param[in] d_w width for resized image + * @param[in] d_h height for resized image + * @param[in] d_c channel for resized image + * @param[in] s_w width for input image + * @param[in] s_h height for input image + * @param[in] s_c channel for input image + * @param[in] stream cuda stream + */ +extern void resize_bilinear_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + cudaStream_t stream); + +/** + * @brief Letterbox a image on gpus + * @param[out] dst letterbox-ed image + * @param[in] src image + * @param[in] d_w width for letterbox-ing + * @param[in] d_h height for letterbox-ing + * @param[in] d_c channel for letterbox-ing + * @param[in] s_w width for input image + * @param[in] s_h height for input image + * @param[in] s_c channel for input image + * @param[in] stream cuda stream + */ +extern void letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + cudaStream_t stream); + +/** + * @brief NHWC to NHWC conversion + * @param[out] dst converted image + * @param[in] src image + * @param[in] d_w width for a image + * @param[in] d_h height for a image + * @param[in] d_c channel for a image + * @param[in] stream cuda stream + */ +extern void nchw_to_nhwc_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream); + +/** + * @brief Unsigned char to float32 for inference + * @param[out] dst32 converted image + * @param[in] src image + * @param[in] d_w width for a image + * @param[in] d_h height for a image + * @param[in] d_c channel for a image + * @param[in] stream cuda stream + */ +extern void to_float_gpu( + float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream); + +/** + * @brief Resize and letterbox a image using bilinear interpolation on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] stream cuda stream + */ +extern void resize_bilinear_letterbox_gpu( + unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + cudaStream_t stream); + +/** + * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization + * for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] mean mean values for each channel + * @param[in] std std values for each channel + * @param[in] stream cuda stream + */ +extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, + float * mean, float * std, cudaStream_t stream); + +/** + * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization + * with batching for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] mean mean values for each channel + * @param[in] std std values for each channel + * @param[in] stream cuda stream + */ +extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, int batch, + float * mean, float * std, cudaStream_t stream); + +/** + * @brief Optimized preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat and + * normalization with batching for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] d_roi regions of interest for cropping + * @param[in] batch batch size + * @param[in] mean mean values for each channel + * @param[in] std std values for each channel + * @param[in] stream cuda stream + */ +extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, + int s_c, int batch, float * mean, float * std, cudaStream_t stream); + +/** + * @brief Optimized multi-scale preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat + * and normalization with batching for YOLOX on gpus + * @param[out] dst processed image + * @param[in] src image + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] d_c channel for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] d_roi regions of interest for cropping + * @param[in] batch batch size + * @param[in] mean mean values for each channel + * @param[in] std std values for each channel + * @param[in] stream cuda stream + */ +extern void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, + int s_c, int batch, float * mean, float * std, cudaStream_t stream); + +/** + * @brief Argmax on GPU + * @param[out] dst processed image + * @param[in] src probability map + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] stream cuda stream + */ +extern void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream); +} // namespace mmros::process +#endif // MMROS__PROCESS__IMAGE_KERNEL_HPP_ diff --git a/mmros/src/detector/detector2d.cpp b/mmros/src/detector/detector2d.cpp index 76312de..f330070 100644 --- a/mmros/src/detector/detector2d.cpp +++ b/mmros/src/detector/detector2d.cpp @@ -20,18 +20,13 @@ #include "mmros/detector/utility.hpp" #include "mmros/process/image.hpp" #include "mmros/tensorrt/cuda_check_error.hpp" -#include "mmros/tensorrt/cuda_unique_ptr.hpp" #include "mmros/tensorrt/utility.hpp" #include -#include -#include -#include #include #include #include -#include #include #include #include @@ -153,41 +148,9 @@ void Detector2D::initCudaPtr(size_t batch_size) /// Execute preprocess. void Detector2D::preprocess(const std::vector & images) { - // (B, C, H, W) - const auto batch_size = images.size(); - auto in_dims = trt_common_->getTensorShape(0); - - cuda::CudaUniquePtrHost img_buf_h; - cuda::CudaUniquePtr img_buf_d; - - scales_.clear(); - for (auto b = 0; b < images.size(); ++b) { - const auto & img = images.at(b); - if (!img_buf_h) { - img_buf_h = cuda::make_unique_host( - img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined); - img_buf_d = cuda::make_unique(img.cols * img.rows * 3 * batch_size); - } - const float scale = - std::min(static_cast(in_width_) / img.cols, static_cast(in_height_) / img.rows); - scales_.emplace_back(scale); - - int index = b * img.cols * img.rows * 3; - // Copy into pinned memory - memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char)); - } - - CHECK_CUDA_ERROR( - ::cudaMemcpyAsync( - img_buf_d.get(), img_buf_h.get(), - images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char), - ::cudaMemcpyHostToDevice, stream_)); - - process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3, - batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_); - - CHECK_CUDA_ERROR(cudaGetLastError()); + process::preprocess_image( + input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(), + detector_config_->std.get(), stream_); } /// Execute postprocess diff --git a/mmros/src/detector/instance_segmeter2d.cpp b/mmros/src/detector/instance_segmeter2d.cpp index a6bbf64..6c66833 100644 --- a/mmros/src/detector/instance_segmeter2d.cpp +++ b/mmros/src/detector/instance_segmeter2d.cpp @@ -19,20 +19,13 @@ #include "mmros/detector/utility.hpp" #include "mmros/process/image.hpp" #include "mmros/tensorrt/cuda_check_error.hpp" -#include "mmros/tensorrt/cuda_unique_ptr.hpp" -#include "mmros/tensorrt/tensorrt_common.hpp" #include "mmros/tensorrt/utility.hpp" #include -#include -#include #include #include -#include -#include -#include #include #include #include @@ -160,41 +153,9 @@ void InstanceSegmenter2D::initCudaPtr(size_t batch_size) void InstanceSegmenter2D::preprocess(const std::vector & images) { - // (B, C, H, W) - const auto batch_size = images.size(); - auto in_dims = trt_common_->getInputDims(0); - - cuda::CudaUniquePtrHost img_buf_h; - cuda::CudaUniquePtr img_buf_d; - - scales_.clear(); - for (auto b = 0; b < images.size(); ++b) { - const auto & img = images.at(b); - if (!img_buf_h) { - img_buf_h = cuda::make_unique_host( - img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined); - img_buf_d = cuda::make_unique(img.cols * img.rows * 3 * batch_size); - } - const float scale = - std::min(static_cast(in_width_) / img.cols, static_cast(in_height_) / img.rows); - scales_.emplace_back(scale); - - int index = b * img.cols * img.rows * 3; - // Copy into pinned memory - memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char)); - } - - CHECK_CUDA_ERROR( - ::cudaMemcpyAsync( - img_buf_d.get(), img_buf_h.get(), - images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char), - ::cudaMemcpyHostToDevice, stream_)); - - process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3, - batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_); - - CHECK_CUDA_ERROR(cudaGetLastError()); + process::preprocess_image( + input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(), + detector_config_->std.get(), stream_); } archetype::Result InstanceSegmenter2D::postprocess( diff --git a/mmros/src/detector/panoptic_segmenter2d.cpp b/mmros/src/detector/panoptic_segmenter2d.cpp index c05e642..2f8a3c6 100644 --- a/mmros/src/detector/panoptic_segmenter2d.cpp +++ b/mmros/src/detector/panoptic_segmenter2d.cpp @@ -20,17 +20,12 @@ #include "mmros/detector/utility.hpp" #include "mmros/process/image.hpp" #include "mmros/tensorrt/cuda_check_error.hpp" -#include "mmros/tensorrt/cuda_unique_ptr.hpp" #include -#include -#include -#include #include #include #include -#include #include #include @@ -168,41 +163,9 @@ void PanopticSegmenter2D::initCudaPtr(size_t batch_size) void PanopticSegmenter2D::preprocess(const std::vector & images) { - // (B, C, H, W) - const auto batch_size = images.size(); - auto in_dims = trt_common_->getTensorShape(0); - - cuda::CudaUniquePtrHost img_buf_h; - cuda::CudaUniquePtr img_buf_d; - - scales_.clear(); - for (auto b = 0; b < images.size(); ++b) { - const auto & img = images.at(b); - if (!img_buf_h) { - img_buf_h = cuda::make_unique_host( - img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined); - img_buf_d = cuda::make_unique(img.cols * img.rows * 3 * batch_size); - } - const float scale = - std::min(static_cast(in_width_) / img.cols, static_cast(in_height_) / img.rows); - scales_.emplace_back(scale); - - int index = b * img.cols * img.rows * 3; - // Copy into pinned memory - memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char)); - } - - CHECK_CUDA_ERROR( - ::cudaMemcpyAsync( - img_buf_d.get(), img_buf_h.get(), - images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char), - ::cudaMemcpyHostToDevice, stream_)); - - process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3, - batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_); - - CHECK_CUDA_ERROR(cudaGetLastError()); + process::preprocess_image( + input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(), + detector_config_->std.get(), stream_); } archetype::Result PanopticSegmenter2D::postprocess( diff --git a/mmros/src/detector/semantic_segmenter2d.cpp b/mmros/src/detector/semantic_segmenter2d.cpp index 4de7d67..bf184f9 100644 --- a/mmros/src/detector/semantic_segmenter2d.cpp +++ b/mmros/src/detector/semantic_segmenter2d.cpp @@ -18,7 +18,6 @@ #include "mmros/archetype/result.hpp" #include "mmros/process/image.hpp" #include "mmros/tensorrt/cuda_check_error.hpp" -#include "mmros/tensorrt/cuda_unique_ptr.hpp" #include #include @@ -26,13 +25,9 @@ #include #include -#include -#include #include #include #include -#include -#include #include #include @@ -144,41 +139,9 @@ void SemanticSegmenter2D::initCudaPtr(size_t batch_size) void SemanticSegmenter2D::preprocess(const std::vector & images) { - // (B, C, H, W) - const auto batch_size = images.size(); - auto in_dims = trt_common_->getTensorShape(0); - - cuda::CudaUniquePtrHost img_buf_h; - cuda::CudaUniquePtr img_buf_d; - - scales_.clear(); - for (auto b = 0; b < images.size(); ++b) { - const auto & img = images.at(b); - if (!img_buf_h) { - img_buf_h = cuda::make_unique_host( - img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined); - img_buf_d = cuda::make_unique(img.cols * img.rows * 3 * batch_size); - } - const float scale = - std::min(static_cast(in_width_) / img.cols, static_cast(in_height_) / img.rows); - scales_.emplace_back(scale); - - int index = b * img.cols * img.rows * 3; - // Copy into pinned memory - memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char)); - } - - CHECK_CUDA_ERROR( - ::cudaMemcpyAsync( - img_buf_d.get(), img_buf_h.get(), - images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char), - ::cudaMemcpyHostToDevice, stream_)); - - process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( - input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3, - batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_); - - CHECK_CUDA_ERROR(cudaGetLastError()); + process::preprocess_image( + input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(), + detector_config_->std.get(), stream_); } archetype::Result SemanticSegmenter2D::postprocess( diff --git a/mmros/src/process/image.cpp b/mmros/src/process/image.cpp new file mode 100644 index 0000000..5f1dba8 --- /dev/null +++ b/mmros/src/process/image.cpp @@ -0,0 +1,63 @@ +// Copyright 2025 Kotaro Uetake. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mmros/process/image.hpp" + +#include "mmros/tensorrt/cuda_unique_ptr.hpp" + +#include +#include + +namespace mmros::process +{ +void preprocess_image( + float * input_d, std::vector & scales, const std::vector & images, + int64_t in_width, int64_t in_height, float * mean, float * std, cudaStream_t stream) +{ + // (B, C, H, W) + const auto batch_size = images.size(); + + cuda::CudaUniquePtrHost img_buf_h; + cuda::CudaUniquePtr img_buf_d; + + scales.clear(); + for (size_t b = 0; b < images.size(); ++b) { + const auto & img = images.at(b); + if (!img_buf_h) { + img_buf_h = cuda::make_unique_host( + img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined); + img_buf_d = cuda::make_unique(img.cols * img.rows * 3 * batch_size); + } + const float scale = + std::min(static_cast(in_width) / img.cols, static_cast(in_height) / img.rows); + scales.emplace_back(scale); + + size_t index = b * img.cols * img.rows * 3; + // Copy into pinned memory + memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char)); + } + + CHECK_CUDA_ERROR( + ::cudaMemcpyAsync( + img_buf_d.get(), img_buf_h.get(), + images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char), + ::cudaMemcpyHostToDevice, stream)); + + process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( + input_d, img_buf_d.get(), in_width, in_height, 3, images[0].cols, images[0].rows, 3, batch_size, + mean, std, stream); + + CHECK_CUDA_ERROR(cudaGetLastError()); +} +} // namespace mmros::process diff --git a/mmros/src/process/image.cu b/mmros/src/process/image_kernel.cu similarity index 99% rename from mmros/src/process/image.cu rename to mmros/src/process/image_kernel.cu index 24c41ee..dd5e296 100644 --- a/mmros/src/process/image.cu +++ b/mmros/src/process/image_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mmros/process/image.hpp" +#include "mmros/process/image_kernel.hpp" #include #include