diff --git a/perception/autoware_bevfusion/CMakeLists.txt b/perception/autoware_bevfusion/CMakeLists.txt index d5e87aeff3399..01e1e79731b19 100644 --- a/perception/autoware_bevfusion/CMakeLists.txt +++ b/perception/autoware_bevfusion/CMakeLists.txt @@ -101,6 +101,7 @@ if(TRT_AVAIL AND CUDA_AVAIL AND SPCONV_AVAIL) lib/postprocess/circle_nms_kernel.cu lib/postprocess/postprocess_kernel.cu lib/preprocess/preprocess_kernel.cu + lib/camera/camera_preprocess_kernel.cu ) target_link_libraries(${PROJECT_NAME}_cuda_lib @@ -120,6 +121,8 @@ if(TRT_AVAIL AND CUDA_AVAIL AND SPCONV_AVAIL) lib/preprocess/precomputed_features.cpp lib/ros_utils.cpp lib/bevfusion_trt.cpp + lib/camera/camera_matrices.cpp + lib/camera/camera_data.cpp ) target_compile_definitions(${PROJECT_NAME}_lib PRIVATE diff --git a/perception/autoware_bevfusion/config/bevfusion_camera_lidar.param.yaml b/perception/autoware_bevfusion/config/bevfusion_camera_lidar.param.yaml index feddd816165bd..fb10df36131c4 100644 --- a/perception/autoware_bevfusion/config/bevfusion_camera_lidar.param.yaml +++ b/perception/autoware_bevfusion/config/bevfusion_camera_lidar.param.yaml @@ -18,6 +18,8 @@ # pre-process params densification_num_past_frames: 1 densification_world_frame_id: map + run_image_undistortion: true + # post-process params circle_nms_dist_threshold: 0.5 iou_nms_search_distance_2d: 10.0 diff --git a/perception/autoware_bevfusion/config/bevfusion_lidar.param.yaml b/perception/autoware_bevfusion/config/bevfusion_lidar.param.yaml index 7d062cbd71e2d..9039aca4f99a9 100644 --- a/perception/autoware_bevfusion/config/bevfusion_lidar.param.yaml +++ b/perception/autoware_bevfusion/config/bevfusion_lidar.param.yaml @@ -18,6 +18,8 @@ # pre-process params densification_num_past_frames: 1 densification_world_frame_id: map + run_image_undistortion: false + # post-process params circle_nms_dist_threshold: 0.5 iou_nms_search_distance_2d: 10.0 diff --git a/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_node.hpp 
b/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_node.hpp index 048bb7fff827a..b520c804b61f7 100644 --- a/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_node.hpp +++ b/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_node.hpp @@ -16,6 +16,7 @@ #define AUTOWARE__BEVFUSION__BEVFUSION_NODE_HPP_ #include "autoware/bevfusion/bevfusion_trt.hpp" +#include "autoware/bevfusion/camera/camera_data.hpp" #include "autoware/bevfusion/detection_class_remapper.hpp" #include "autoware/bevfusion/postprocess/non_maximum_suppression.hpp" #include "autoware/bevfusion/preprocess/pointcloud_densification.hpp" @@ -67,7 +68,8 @@ class BEVFUSION_PUBLIC BEVFusionNode : public rclcpp::Node void diagnoseProcessingTime(diagnostic_updater::DiagnosticStatusWrapper & stat); // Helper methods for constructor - void initializeSensorFusionSubscribers(std::int64_t num_cameras); + void initializeSensorFusionSubscribers( + std::int64_t num_cameras, const ImagePreProcessingParams & image_pre_processing_params); void validateParameters( const std::vector & point_cloud_range, const std::vector & voxel_size); @@ -100,6 +102,11 @@ class BEVFUSION_PUBLIC BEVFusionNode : public rclcpp::Node std::vector::ConstSharedPtr> camera_info_subs_; rclcpp::Publisher::SharedPtr objects_pub_{ nullptr}; + // unique_ptr to avoid copying the actual camera data in memory since there's gpu buffer in the + // camera data + std::vector> camera_data_ptrs_; + // One CameraMatrices object can be shared by several CameraData + std::vector> camera_matrices_ptrs_; tf2_ros::Buffer tf_buffer_; tf2_ros::TransformListener tf_listener_{tf_buffer_}; @@ -109,9 +116,7 @@ class BEVFUSION_PUBLIC BEVFusionNode : public rclcpp::Node std::optional lidar_frame_; float max_camera_lidar_delay_; - std::vector image_msgs_; std::vector camera_masks_; - std::vector> camera_info_msgs_; std::vector> lidar2camera_extrinsics_; bool sensor_fusion_{false}; diff --git 
a/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_trt.hpp b/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_trt.hpp index 56f40655f1e0b..df6e44ea3b196 100644 --- a/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_trt.hpp +++ b/perception/autoware_bevfusion/include/autoware/bevfusion/bevfusion_trt.hpp @@ -15,6 +15,7 @@ #ifndef AUTOWARE__BEVFUSION__BEVFUSION_TRT_HPP_ #define AUTOWARE__BEVFUSION__BEVFUSION_TRT_HPP_ +#include "autoware/bevfusion/camera/camera_data.hpp" #include "autoware/bevfusion/postprocess/postprocess_kernel.hpp" #include "autoware/bevfusion/preprocess/pointcloud_densification.hpp" #include "autoware/bevfusion/preprocess/preprocess_kernel.hpp" @@ -87,7 +88,7 @@ class BEVFUSION_PUBLIC BEVFusionTRT bool detect( const std::shared_ptr & input_pointcloud_msg_ptr, - const std::vector & image_msgs, + const std::vector> & camera_data_ptrs, const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d, std::unordered_map & proc_timing, bool & is_num_voxels_within_range); @@ -102,7 +103,7 @@ class BEVFUSION_PUBLIC BEVFusionTRT bool preProcess( const std::shared_ptr & pc_msg_ptr, - const std::vector & image_msgs, + const std::vector> & camera_data_ptrs, const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, bool & is_num_voxels_within_range); @@ -111,8 +112,11 @@ class BEVFUSION_PUBLIC BEVFusionTRT void clearDeviceMemory(); - void processImages( - const std::vector & image_msgs, + bool checkImageCameraMatricesReady( + const std::vector> & camera_data_ptrs); + + bool processImages( + const std::vector> & camera_data_ptrs, const std::vector & camera_masks); std::int64_t processPointCloudVoxelization( @@ -140,7 +144,6 @@ class BEVFUSION_PUBLIC BEVFusionTRT std::unique_ptr pre_ptr_{nullptr}; std::unique_ptr post_ptr_{nullptr}; cudaStream_t stream_{nullptr}; - std::vector camera_streams_{}; BEVFusionConfig config_; std::vector roi_start_y_vector_; @@ -171,7 +174,6 
@@ class BEVFUSION_PUBLIC BEVFusionTRT // image buffers CudaUniquePtr roi_tensor_d_{nullptr}; - std::vector> image_buffers_d_{}; CudaUniquePtr camera_masks_d_{nullptr}; // image feature buffers for fusion model diff --git a/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_data.hpp b/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_data.hpp new file mode 100644 index 0000000000000..1262571941251 --- /dev/null +++ b/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_data.hpp @@ -0,0 +1,104 @@ +// Copyright 2026 TIER IV +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__BEVFUSION__CAMERA__CAMERA_DATA_HPP_ +#define AUTOWARE__BEVFUSION__CAMERA__CAMERA_DATA_HPP_ + +#include "autoware/bevfusion/camera/camera_matrices.hpp" +#include "autoware/bevfusion/camera/camera_preprocess.hpp" + +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace autoware::bevfusion +{ +using autoware::cuda_utils::CudaUniquePtr; + +struct ImagePreProcessingParams +{ + std::int64_t original_image_height; + std::int64_t original_image_width; + std::int64_t roi_height; + std::int64_t roi_width; + float image_aug_scale_y; + float image_aug_scale_x; + bool run_image_undistortion; + std::int64_t crop_height; + std::int64_t crop_width; + std::int64_t resized_height; + std::int64_t resized_width; + std::int64_t roi_start_y; + std::int64_t roi_start_x; + + ImagePreProcessingParams( + const std::int64_t original_image_height, const std::int64_t original_image_width, + const std::int64_t roi_height, const std::int64_t roi_width, const float image_aug_scale_y, + const float image_aug_scale_x, const bool run_image_undistortion); +}; + +class CameraData +{ +public: + CameraData( + rclcpp::Node * node, const int camera_id, + const ImagePreProcessingParams & image_pre_processing_params, + const std::shared_ptr & camera_matrices_ptr); + ~CameraData(); + + void update_image_msg(const sensor_msgs::msg::Image::ConstSharedPtr & input_camera_image_msg); + void update_camera_info(const sensor_msgs::msg::CameraInfo & input_camera_info_msg); + std::optional camera_info() const; + sensor_msgs::msg::CameraInfo camera_info_value() const; + + sensor_msgs::msg::Image::ConstSharedPtr image_msg() const; + bool is_image_msg_available() const; + bool is_camera_info_available() const; + std::size_t output_img_offset() const; + + bool preprocess_image(std::uint8_t * output_img); + bool is_camera_matrices_ready() const; + cudaError_t sync_cuda_stream(); + +private: + // Camera info buffer + std::optional camera_info_; + + // Image 
buffer + // TODO(KokSeang): Remove this once we move preprocessing to subscribers + sensor_msgs::msg::Image::ConstSharedPtr image_msg_; + + // GPU Memory for preprocessed image + // image buffers + CudaUniquePtr image_buffer_d_{nullptr}; + CudaUniquePtr undistorted_image_buffer_d_{nullptr}; + + rclcpp::Logger logger_; + int camera_id_; + ImagePreProcessingParams image_pre_processing_params_; + std::size_t output_img_offset_; + + cudaStream_t stream_{nullptr}; + std::unique_ptr camera_preprocess_ptr_{nullptr}; + std::shared_ptr camera_matrices_ptr_{nullptr}; +}; +} // namespace autoware::bevfusion +#endif // AUTOWARE__BEVFUSION__CAMERA__CAMERA_DATA_HPP_ diff --git a/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_matrices.hpp b/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_matrices.hpp new file mode 100644 index 0000000000000..0b895aea2252b --- /dev/null +++ b/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_matrices.hpp @@ -0,0 +1,46 @@ +// Copyright 2026 TIER IV +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__BEVFUSION__CAMERA__CAMERA_MATRICES_HPP_ +#define AUTOWARE__BEVFUSION__CAMERA__CAMERA_MATRICES_HPP_ + +#include +#include + +#include + +namespace autoware::bevfusion +{ +using autoware::cuda_utils::CudaUniquePtr; + +// Helper struct for camera matrices +struct CameraMatrices +{ + cv::Mat K; + cv::Mat D; + cv::Mat P; + int map_width; + int map_height; + bool matrices_ready{false}; + + // GPU Memory for undistortion maps and they only need to be computed once for each camera + CudaUniquePtr undistorted_map_x_d_{nullptr}; + CudaUniquePtr undistorted_map_y_d_{nullptr}; + + CameraMatrices(); + void update_camera_matrices(const sensor_msgs::msg::CameraInfo & camera_info); + void compute_undistorted_map_x_y(); +}; +} // namespace autoware::bevfusion +#endif // AUTOWARE__BEVFUSION__CAMERA__CAMERA_MATRICES_HPP_ diff --git a/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_preprocess.hpp b/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_preprocess.hpp new file mode 100644 index 0000000000000..70e30ff2962f5 --- /dev/null +++ b/perception/autoware_bevfusion/include/autoware/bevfusion/camera/camera_preprocess.hpp @@ -0,0 +1,50 @@ +// Copyright 2026 TIER IV +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef AUTOWARE__BEVFUSION__CAMERA__CAMERA_PREPROCESS_HPP_ +#define AUTOWARE__BEVFUSION__CAMERA__CAMERA_PREPROCESS_HPP_ + +#include "autoware/bevfusion/utils.hpp" + +#include + +#include + +namespace autoware::bevfusion +{ + +class CameraPreprocess +{ +public: + CameraPreprocess(cudaStream_t stream, const int camera_id); + + cudaError_t resizeAndExtractRoi_launch( + const std::uint8_t * input_img, std::uint8_t * output_img, int H, + int W, // Original image dimensions + int H2, int W2, // Resized image dimensions + int H3, int W3, // ROI dimensions + int y_start, int x_start // ROI top-left coordinates in resized image + ); + cudaError_t remap_launch( + const std::uint8_t * input_img, std::uint8_t * output_img, int output_height, + int output_width, // Output (destination) image dimensions + int input_height, int input_width, // Input (source) image dimensions + const float * map_x, const float * map_y); + +private: + cudaStream_t stream_; + int camera_id_; +}; +} // namespace autoware::bevfusion +#endif // AUTOWARE__BEVFUSION__CAMERA__CAMERA_PREPROCESS_HPP_ diff --git a/perception/autoware_bevfusion/include/autoware/bevfusion/preprocess/preprocess_kernel.hpp b/perception/autoware_bevfusion/include/autoware/bevfusion/preprocess/preprocess_kernel.hpp index ad1dcdfde11a6..22d9c06a5bd96 100644 --- a/perception/autoware_bevfusion/include/autoware/bevfusion/preprocess/preprocess_kernel.hpp +++ b/perception/autoware_bevfusion/include/autoware/bevfusion/preprocess/preprocess_kernel.hpp @@ -59,10 +59,6 @@ class PreprocessCuda const float * points, unsigned int points_size, float * voxel_features, std::int32_t * voxel_coords, std::int32_t * num_points_per_voxel); - cudaError_t resizeAndExtractRoi_launch( - const std::uint8_t * input_img, std::uint8_t * output_img, int H, int W, int H2, int W2, int H3, - int W3, int y_start, int x_start, cudaStream_t stream); - private: BEVFusionConfig config_; cudaStream_t stream_; diff --git 
a/perception/autoware_bevfusion/launch/bevfusion.launch.xml b/perception/autoware_bevfusion/launch/bevfusion.launch.xml index 01bddca4b1082..5b130423fde60 100644 --- a/perception/autoware_bevfusion/launch/bevfusion.launch.xml +++ b/perception/autoware_bevfusion/launch/bevfusion.launch.xml @@ -74,18 +74,18 @@ - - - - - - - - - - - - + + + + + + + + + + + + diff --git a/perception/autoware_bevfusion/lib/bevfusion_trt.cpp b/perception/autoware_bevfusion/lib/bevfusion_trt.cpp index 620e162ee1fe5..dc34c99cbfef9 100644 --- a/perception/autoware_bevfusion/lib/bevfusion_trt.cpp +++ b/perception/autoware_bevfusion/lib/bevfusion_trt.cpp @@ -45,10 +45,6 @@ BEVFusionTRT::BEVFusionTRT( { // Create and init cuda streams CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); - camera_streams_.resize(config_.num_cameras_); - for (std::int64_t i = 0; i < config_.num_cameras_; i++) { - CHECK_CUDA_ERROR(cudaStreamCreate(&camera_streams_[i])); - } vg_ptr_ = std::make_unique(densification_param, config_, stream_); stop_watch_ptr_ = @@ -65,13 +61,6 @@ BEVFusionTRT::~BEVFusionTRT() cudaStreamSynchronize(stream_); cudaStreamDestroy(stream_); } - - for (std::int64_t i = 0; i < config_.num_cameras_; i++) { - if (camera_streams_[i]) { - cudaStreamSynchronize(camera_streams_[i]); - cudaStreamDestroy(camera_streams_[i]); - } - } } void BEVFusionTRT::initPtr() @@ -112,11 +101,6 @@ void BEVFusionTRT::initPtr() roi_tensor_d_ = autoware::cuda_utils::make_unique( config_.num_cameras_ * config_.roi_height_ * config_.roi_width_ * BEVFusionConfig::kNumRGBChannels); - for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { - image_buffers_d_.emplace_back( - autoware::cuda_utils::make_unique( - config_.raw_image_height_ * config_.raw_image_width_ * BEVFusionConfig::kNumRGBChannels)); - } camera_masks_d_ = autoware::cuda_utils::make_unique(config_.num_cameras_); // buffers for fusion model with separate image backbone @@ -368,13 +352,14 @@ void 
BEVFusionTRT::setSensorFusionTensorAddresses() bool BEVFusionTRT::detect( const std::shared_ptr & pc_msg_ptr, - const std::vector & image_msgs, + const std::vector> & camera_data_ptrs, const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d, std::unordered_map & proc_timing, bool & is_num_voxels_within_range) { stop_watch_ptr_->toc("processing/inner", true); - if (!preProcess(pc_msg_ptr, image_msgs, camera_masks, tf_buffer, is_num_voxels_within_range)) { + if (!preProcess( + pc_msg_ptr, camera_data_ptrs, camera_masks, tf_buffer, is_num_voxels_within_range)) { RCLCPP_ERROR(rclcpp::get_logger("bevfusion"), "Pre-process failed. Skipping detection."); return false; } @@ -454,19 +439,20 @@ void BEVFusionTRT::setIntrinsicsExtrinsics( assert(num_geom_feats_ == BEVFusionConfig::kTransformMatrixDim * num_ranks_); assert(num_ranks_ == num_indices_); - cudaMemcpy( + CHECK_CUDA_ERROR(cudaMemcpy( lidar2image_d_.get(), lidar2images_flattened.data(), config_.num_cameras_ * BEVFusionConfig::kTransformMatrixDim * BEVFusionConfig::kTransformMatrixDim * sizeof(float), - cudaMemcpyHostToDevice); - cudaMemcpy( + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( geom_feats_d_.get(), geom_feats.data(), num_geom_feats_ * sizeof(std::int32_t), - cudaMemcpyHostToDevice); - cudaMemcpy(kept_d_.get(), kept.data(), num_kept_ * sizeof(std::uint8_t), cudaMemcpyHostToDevice); - cudaMemcpy( - ranks_d_.get(), ranks.data(), num_ranks_ * sizeof(std::int64_t), cudaMemcpyHostToDevice); - cudaMemcpy( - indices_d_.get(), indices.data(), num_indices_ * sizeof(std::int64_t), cudaMemcpyHostToDevice); + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + kept_d_.get(), kept.data(), num_kept_ * sizeof(std::uint8_t), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + ranks_d_.get(), ranks.data(), num_ranks_ * sizeof(std::int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + indices_d_.get(), indices.data(), num_indices_ * 
sizeof(std::int64_t), cudaMemcpyHostToDevice)); // Copy img_aug_matrix data for fusion model (fusion model always uses separate image backbone) // Each Matrix4fRowM is contiguous, copy each matrix directly to its position in device memory @@ -475,9 +461,9 @@ void BEVFusionTRT::setIntrinsicsExtrinsics( BEVFusionConfig::kTransformMatrixDim * BEVFusionConfig::kTransformMatrixDim; for (std::int64_t i = 0; i < config_.num_cameras_; i++) { - cudaMemcpy( + CHECK_CUDA_ERROR(cudaMemcpy( img_aug_matrix_d_.get() + i * matrix_size, img_aug_matrices_[i].data(), - matrix_size * sizeof(float), cudaMemcpyHostToDevice); + matrix_size * sizeof(float), cudaMemcpyHostToDevice)); } } } @@ -514,42 +500,36 @@ void BEVFusionTRT::clearDeviceMemory() CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); } -void BEVFusionTRT::processImages( - const std::vector & image_msgs, - const std::vector & camera_masks) +bool BEVFusionTRT::checkImageCameraMatricesReady( + const std::vector> & camera_data_ptrs) { - if (!config_.sensor_fusion_) { - return; - } - - // TODO(knzo25): move this to each image callback - const int start_x = - std::max(0, static_cast(config_.resized_width_) - static_cast(config_.roi_width_)) / - 2; - for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { - const int start_y = roi_start_y_vector_[camera_id]; - cudaMemcpyAsync( - image_buffers_d_[camera_id].get(), image_msgs[camera_id]->data.data(), - config_.raw_image_height_ * config_.raw_image_width_ * BEVFusionConfig::kNumRGBChannels, - cudaMemcpyHostToDevice, stream_); - - pre_ptr_->resizeAndExtractRoi_launch( - image_buffers_d_[camera_id].get(), - &roi_tensor_d_ - [camera_id * config_.roi_height_ * config_.roi_width_ * BEVFusionConfig::kNumRGBChannels], - config_.raw_image_height_, config_.raw_image_width_, config_.resized_height_, - config_.resized_width_, config_.roi_height_, config_.roi_width_, start_y, start_x, - camera_streams_[camera_id]); + if 
(!camera_data_ptrs[camera_id]->is_camera_matrices_ready()) { + return false; + } } + return true; +} - for (std::int64_t i = 0; i < config_.num_cameras_; i++) { - cudaStreamSynchronize(camera_streams_[i]); +bool BEVFusionTRT::processImages( + const std::vector> & camera_data_ptrs, + const std::vector & camera_masks) +{ + for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { + auto roi_tensor_offset = camera_data_ptrs[camera_id]->output_img_offset(); + if (!camera_data_ptrs[camera_id]->preprocess_image(&roi_tensor_d_[roi_tensor_offset])) { + return false; + } } - cudaMemcpyAsync( + CHECK_CUDA_ERROR(cudaMemcpyAsync( camera_masks_d_.get(), camera_masks.data(), config_.num_cameras_ * sizeof(float), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream_)); + + for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { + CHECK_CUDA_ERROR(camera_data_ptrs[camera_id]->sync_cuda_stream()); + } + return true; } std::int64_t BEVFusionTRT::processPointCloudVoxelization( @@ -628,7 +608,7 @@ void BEVFusionTRT::configureTensorRTInputs(std::int64_t num_voxels, std::size_t bool BEVFusionTRT::preProcess( const std::shared_ptr & pc_msg_ptr, - const std::vector & image_msgs, + const std::vector> & camera_data_ptrs, const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, bool & is_num_voxels_within_range) { @@ -643,7 +623,24 @@ bool BEVFusionTRT::preProcess( } clearDeviceMemory(); - processImages(image_msgs, camera_masks); + + // Process images if sensor fusion is enabled + if (config_.sensor_fusion_) { + // Check if image camera matrices are ready + if (!checkImageCameraMatricesReady(camera_data_ptrs)) { + RCLCPP_ERROR( + rclcpp::get_logger("bevfusion"), + "Image camera matrices are not ready. Skipping pre-processing."); + return false; + } + + // Process Images + if (!processImages(camera_data_ptrs, camera_masks)) { + RCLCPP_ERROR( + rclcpp::get_logger("bevfusion"), "Failed to process images. 
Skipping detection."); + return false; + } + } const auto num_points = vg_ptr_->generateSweepPoints(points_d_); if (num_points == 0) { @@ -668,12 +665,12 @@ bool BEVFusionTRT::preProcess( std::vector roi_host_data( config_.roi_height_ * config_.roi_width_ * BEVFusionConfig::kNumRGBChannels); for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { - cudaMemcpy( + CHECK_CUDA_ERROR(cudaMemcpy( roi_host_data.data(), &roi_tensor_d_ [camera_id * config_.roi_height_ * config_.roi_width_ * BEVFusionConfig::kNumRGBChannels], config_.roi_height_ * config_.roi_width_ * BEVFusionConfig::kNumRGBChannels, - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost)); } } diff --git a/perception/autoware_bevfusion/lib/camera/camera_data.cpp b/perception/autoware_bevfusion/lib/camera/camera_data.cpp new file mode 100644 index 0000000000000..648f1e595c430 --- /dev/null +++ b/perception/autoware_bevfusion/lib/camera/camera_data.cpp @@ -0,0 +1,217 @@ +// Copyright 2026 TIER IV +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "autoware/bevfusion/camera/camera_data.hpp" + +#include "autoware/bevfusion/bevfusion_config.hpp" +#include "autoware/bevfusion/camera/camera_preprocess.hpp" + +#include +#include + +namespace autoware::bevfusion +{ + +ImagePreProcessingParams::ImagePreProcessingParams( + const std::int64_t image_height, const std::int64_t image_width, const std::int64_t roi_height, + const std::int64_t roi_width, const float image_aug_scale_y, const float image_aug_scale_x, + const bool run_image_undistortion) +: original_image_height(image_height), + original_image_width(image_width), + roi_height(roi_height), + roi_width(roi_width), + image_aug_scale_y(image_aug_scale_y), + image_aug_scale_x(image_aug_scale_x), + run_image_undistortion(run_image_undistortion) +{ + resized_height = original_image_height * image_aug_scale_y; + resized_width = original_image_width * image_aug_scale_x; + crop_height = resized_height - roi_height; + crop_width = std::max( + static_cast(0), static_cast((resized_width - roi_width) / 2)); + + roi_start_x = std::max(static_cast(0), crop_width); + roi_start_y = std::max(static_cast(0), crop_height); +} + +CameraData::CameraData( + rclcpp::Node * node, const int camera_id, + const ImagePreProcessingParams & image_pre_processing_params, + const std::shared_ptr & camera_matrices_ptr) +: logger_(node->get_logger()), + camera_id_(camera_id), + image_pre_processing_params_(image_pre_processing_params), + camera_matrices_ptr_(camera_matrices_ptr) +{ + // Initialization cuda stream + cudaStreamCreate(&stream_); + + camera_preprocess_ptr_ = std::make_unique(stream_, camera_id_); + image_buffer_d_ = autoware::cuda_utils::make_unique( + image_pre_processing_params_.original_image_height * + image_pre_processing_params_.original_image_width * BEVFusionConfig::kNumRGBChannels); + + if (image_pre_processing_params_.run_image_undistortion) { + undistorted_image_buffer_d_ = autoware::cuda_utils::make_unique( + 
image_pre_processing_params_.original_image_height * + image_pre_processing_params_.original_image_width * BEVFusionConfig::kNumRGBChannels); + } + + output_img_offset_ = camera_id_ * BEVFusionConfig::kNumRGBChannels * + image_pre_processing_params_.roi_height * + image_pre_processing_params_.roi_width; +} + +CameraData::~CameraData() +{ + if (stream_ != nullptr) { + // 1. Wait for ALL pending GPU work to finish + cudaStreamSynchronize(stream_); + + // 2. Destroy the stream handle + cudaStreamDestroy(stream_); + + // 3. Mark it as null + stream_ = nullptr; + } +} + +void CameraData::update_image_msg( + const sensor_msgs::msg::Image::ConstSharedPtr & input_camera_image_msg) +{ + // Update image message + image_msg_ = input_camera_image_msg; +} + +void CameraData::update_camera_info(const sensor_msgs::msg::CameraInfo & camera_info_msg) +{ + // Update camera info message + camera_info_ = camera_info_msg; + + // Dont need to compute matrices and undistortion if it's not enabled + if (!image_pre_processing_params_.run_image_undistortion) { + return; + } + + // Check if undistorted map x and y are already computed + if ( + camera_matrices_ptr_->undistorted_map_x_d_ != nullptr && + camera_matrices_ptr_->undistorted_map_y_d_ != nullptr) { + return; + } + + // Update camera matrices + camera_matrices_ptr_->update_camera_matrices(camera_info_msg); + + // Compute undistorted map x and y + camera_matrices_ptr_->compute_undistorted_map_x_y(); +} + +bool CameraData::is_image_msg_available() const +{ + return image_msg_ != nullptr; +} + +bool CameraData::is_camera_info_available() const +{ + return camera_info_.has_value(); +} + +std::optional CameraData::camera_info() const +{ + return camera_info_; +} + +sensor_msgs::msg::CameraInfo CameraData::camera_info_value() const +{ + return *(camera_info_); +} + +sensor_msgs::msg::Image::ConstSharedPtr CameraData::image_msg() const +{ + return image_msg_; +} + +std::size_t CameraData::output_img_offset() const +{ + return 
output_img_offset_; +} + +bool CameraData::preprocess_image(std::uint8_t * output_img) +{ + // 1. Copy image from CPU to GPU + CHECK_CUDA_ERROR(cudaMemcpyAsync( + image_buffer_d_.get(), image_msg_->data.data(), + image_pre_processing_params_.original_image_height * + image_pre_processing_params_.original_image_width * BEVFusionConfig::kNumRGBChannels, + cudaMemcpyHostToDevice, stream_)); + + if (image_pre_processing_params_.run_image_undistortion) { + // 1. Check if original image shape is the same as the original image shape + if ( + image_msg_->height != image_pre_processing_params_.original_image_height || + image_msg_->width != image_pre_processing_params_.original_image_width) { + RCLCPP_ERROR(logger_, "Input image shape is not the same as the original image shape"); + return false; + } + + // 2. Launch remap kernel for undistortion + camera_preprocess_ptr_->remap_launch( + image_buffer_d_.get(), undistorted_image_buffer_d_.get(), + // Output dimensions (full resolution) + image_pre_processing_params_.original_image_height, + image_pre_processing_params_.original_image_width, + // Input dimensions (full resolution) + image_msg_->height, image_msg_->width, + // Undistorted map x and y + camera_matrices_ptr_->undistorted_map_x_d_.get(), + camera_matrices_ptr_->undistorted_map_y_d_.get()); + + // 3. 
Resize and extract ROI, and then saving to output_img + // output_img is expected to be in the size of roi_height * roi_width * image channels (usually, + camera_preprocess_ptr_->resizeAndExtractRoi_launch( + undistorted_image_buffer_d_.get(), output_img, + image_pre_processing_params_.original_image_height, + image_pre_processing_params_.original_image_width, + image_pre_processing_params_.resized_height, image_pre_processing_params_.resized_width, + image_pre_processing_params_.roi_height, image_pre_processing_params_.roi_width, + image_pre_processing_params_.roi_start_y, image_pre_processing_params_.roi_start_x); + } else { + // Skip the undistortion step and directly resize and extract ROI + camera_preprocess_ptr_->resizeAndExtractRoi_launch( + image_buffer_d_.get(), output_img, image_pre_processing_params_.original_image_height, + image_pre_processing_params_.original_image_width, + image_pre_processing_params_.resized_height, image_pre_processing_params_.resized_width, + image_pre_processing_params_.roi_height, image_pre_processing_params_.roi_width, + image_pre_processing_params_.roi_start_y, image_pre_processing_params_.roi_start_x); + } + + return true; +} + +bool CameraData::is_camera_matrices_ready() const +{ + if (!image_pre_processing_params_.run_image_undistortion) { + return true; + } else { + return camera_matrices_ptr_->matrices_ready; + } +} + +cudaError_t CameraData::sync_cuda_stream() +{ + return cudaStreamSynchronize(stream_); +} + +} // namespace autoware::bevfusion diff --git a/perception/autoware_bevfusion/lib/camera/camera_matrices.cpp b/perception/autoware_bevfusion/lib/camera/camera_matrices.cpp new file mode 100644 index 0000000000000..1fde831a80f50 --- /dev/null +++ b/perception/autoware_bevfusion/lib/camera/camera_matrices.cpp @@ -0,0 +1,81 @@ +// Copyright 2026 TIER IV +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/bevfusion/camera/camera_matrices.hpp" + +#include + +namespace autoware::bevfusion +{ + +CameraMatrices::CameraMatrices() +{ + K = cv::Mat::zeros(3, 3, CV_64F); + D = cv::Mat::zeros(1, 5, CV_64F); + P = cv::Mat::zeros(3, 4, CV_64F); + map_width = 0; + map_height = 0; + + matrices_ready = false; +} + +void CameraMatrices::update_camera_matrices(const sensor_msgs::msg::CameraInfo & camera_info) +{ + // Create camera matrix K from camera_info + K = + (cv::Mat_(3, 3) << camera_info.k[0], camera_info.k[1], camera_info.k[2], + camera_info.k[3], camera_info.k[4], camera_info.k[5], camera_info.k[6], camera_info.k[7], + camera_info.k[8]); + + // Create distortion coefficients matrix D from camera_info + const auto & d_vec = camera_info.d; + D = cv::Mat(1, static_cast(d_vec.size()), CV_64F); + for (size_t i = 0; i < d_vec.size(); ++i) { + D.at(0, static_cast(i)) = d_vec[i]; + } + + // Create projection matrix P from camera_info (first 3x3 part) + P = + (cv::Mat_(3, 3) << camera_info.p[0], camera_info.p[1], camera_info.p[2], + camera_info.p[4], camera_info.p[5], camera_info.p[6], camera_info.p[8], camera_info.p[9], + camera_info.p[10]); + + // Use full resolution for both input and output (no downsampling) + map_width = camera_info.width; + map_height = camera_info.height; +} + +void CameraMatrices::compute_undistorted_map_x_y() +{ + undistorted_map_x_d_ = autoware::cuda_utils::make_unique(map_width * map_height); + undistorted_map_y_d_ = autoware::cuda_utils::make_unique(map_width * map_height); + + // Compute undistortion maps for full 
resolution + cv::Mat undistort_map_x, undistort_map_y; + cv::initUndistortRectifyMap( + K, D, cv::Mat(), P, cv::Size(map_width, map_height), CV_32FC1, undistort_map_x, + undistort_map_y); + + // Always sync the data to the GPU since it only runs once for every camera + CHECK_CUDA_ERROR(cudaMemcpy( + undistorted_map_x_d_.get(), undistort_map_x.data, map_width * map_height * sizeof(float), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + undistorted_map_y_d_.get(), undistort_map_y.data, map_width * map_height * sizeof(float), + cudaMemcpyHostToDevice)); + + matrices_ready = true; +} + +} // namespace autoware::bevfusion diff --git a/perception/autoware_bevfusion/lib/camera/camera_preprocess_kernel.cu b/perception/autoware_bevfusion/lib/camera/camera_preprocess_kernel.cu new file mode 100644 index 0000000000000..f416fd5b2c477 --- /dev/null +++ b/perception/autoware_bevfusion/lib/camera/camera_preprocess_kernel.cu @@ -0,0 +1,268 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
// SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
// All rights reserved. SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "autoware/bevfusion/camera/camera_preprocess.hpp"

#include <cuda_runtime.h>

#include <cstdint>

namespace autoware::bevfusion
{

// Binds the per-camera preprocessing pipeline to a CUDA stream.
CameraPreprocess::CameraPreprocess(cudaStream_t stream, const int camera_id)
: stream_(stream), camera_id_(camera_id)
{
}

// Undistortion remap: for every output pixel, look up its source coordinate
// in the precomputed maps and bilinearly interpolate the 3-channel input.
// Maps are sized to the OUTPUT image (one entry per destination pixel).
__global__ void remap_kernel(
  const std::uint8_t * __restrict__ input_img, std::uint8_t * __restrict__ output_img,
  int output_height, int output_width,  // Output (destination) image dimensions
  int input_height, int input_width,    // Input (source) image dimensions
  const float * __restrict__ map_x, const float * __restrict__ map_y)
{
  // Global thread indices over the output image
  const int x = blockIdx.x * blockDim.x + threadIdx.x;  // Width index in output
  const int y = blockIdx.y * blockDim.y + threadIdx.y;  // Height index in output

  // Discard threads outside the output image
  if (x >= output_width || y >= output_height) return;

  // Source coordinates for this output pixel:
  // skip y full rows (y * output_width) plus x columns
  const int map_idx = y * output_width + x;
  const float src_x = map_x[map_idx];
  const float src_y = map_y[map_idx];

  // Interleaved HWC (3-channel) index of the destination pixel
  const int out_idx = (y * output_width + x) * 3;

  // Out-of-bounds source coordinate -> black pixel
  if (src_x < 0 || src_y < 0 || src_x >= input_width || src_y >= input_height) {
    output_img[out_idx] = 0;
    output_img[out_idx + 1] = 0;
    output_img[out_idx + 2] = 0;
    return;
  }

  // Bilinear interpolation between the four neighbouring source pixels
  const int x0 = static_cast<int>(floorf(src_x));
  const int y0 = static_cast<int>(floorf(src_y));
  const int x1 = x0 + 1;
  const int y1 = y0 + 1;

  const float dx = src_x - x0;
  const float dy = src_y - y0;

  // Interpolation weights of the unit square (0,0), (0,1), (1,0), (1,1)
  const float w00 = (1.0f - dx) * (1.0f - dy);
  const float w01 = (1.0f - dx) * dy;
  const float w10 = dx * (1.0f - dy);
  const float w11 = dx * dy;

// Process each color channel
#pragma unroll
  for (int c = 0; c < 3; ++c) {
    float v00 = 0.0f, v01 = 0.0f, v10 = 0.0f, v11 = 0.0f;

    // Per-neighbour boundary checks: x1/y1 may fall one pixel past the edge
    if (x0 >= 0 && x0 < input_width && y0 >= 0 && y0 < input_height)
      v00 = static_cast<float>(input_img[(y0 * input_width + x0) * 3 + c]);
    if (x0 >= 0 && x0 < input_width && y1 >= 0 && y1 < input_height)
      v01 = static_cast<float>(input_img[(y1 * input_width + x0) * 3 + c]);
    if (x1 >= 0 && x1 < input_width && y0 >= 0 && y0 < input_height)
      v10 = static_cast<float>(input_img[(y0 * input_width + x1) * 3 + c]);
    if (x1 >= 0 && x1 < input_width && y1 >= 0 && y1 < input_height)
      v11 = static_cast<float>(input_img[(y1 * input_width + x1) * 3 + c]);

    // Interpolate, clamp to the valid byte range, and store
    float interpolated_value = w00 * v00 + w01 * v01 + w10 * v10 + w11 * v11;
    interpolated_value = fmaxf(0.0f, fminf(255.0f, interpolated_value));
    output_img[out_idx + c] = static_cast<std::uint8_t>(interpolated_value);
  }
}
// -------------------------------------------------------------------------
// Fused Kernel: Anti-Aliased Resize -> Crop -> Planar CHW Layout
// -------------------------------------------------------------------------
// Mimics PIL's resize (bilinear/triangle filter with adaptive support).
// For downscaling, the filter window expands to cover all contributing
// pixels (anti-aliasing). For upscaling, it acts as standard bilinear
// interpolation.
__global__ void resizeAndExtractRoi_kernel(
  const std::uint8_t * __restrict__ input_img, std::uint8_t * __restrict__ output_img, int in_h,
  int in_w,                          // Input dimensions
  int resize_h, int resize_w,        // Target resize dimensions (before cropping)
  int roi_h, int roi_w,              // Output ROI dimensions
  int roi_y_start, int roi_x_start)  // Top-left of ROI in the resized coordinate space
{
  // 1. Thread -> target pixel in the ROI (output image)
  const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
  const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

  if (out_x >= roi_w || out_y >= roi_h) return;

  // 2. Map the ROI pixel into the virtual resized-image coordinate space
  // (the image that *would* exist if the whole input were resized)
  const float resized_x = static_cast<float>(out_x + roi_x_start);
  const float resized_y = static_cast<float>(out_y + roi_y_start);

  // 3. Scale factors (input / resized); scale > 1.0 means downsampling
  const float scale_x = static_cast<float>(in_w) / static_cast<float>(resize_w);
  const float scale_y = static_cast<float>(in_h) / static_cast<float>(resize_h);

  // 4. Center point in the input image, standard half-pixel alignment:
  // (x + 0.5) * scale - 0.5
  const float center_x = (resized_x + 0.5f) * scale_x - 0.5f;
  const float center_y = (resized_y + 0.5f) * scale_y - 0.5f;

  // 5. Filter support (kernel radius): grows with the factor when
  // downsampling (anti-aliasing); 1.0 when upsampling (plain bilinear)
  const float support_x = (scale_x > 1.0f) ? scale_x : 1.0f;
  const float support_y = (scale_y > 1.0f) ? scale_y : 1.0f;

  // 6. Sampling bounds [center - support, center + support], clamped to the
  // input image
  const int x_min = max(0, static_cast<int>(ceilf(center_x - support_x)));
  const int x_max = min(in_w - 1, static_cast<int>(floorf(center_x + support_x)));
  const int y_min = max(0, static_cast<int>(ceilf(center_y - support_y)));
  const int y_max = min(in_h - 1, static_cast<int>(floorf(center_y + support_y)));

  // Per-channel accumulators plus the running weight sum
  float sum_first_channel = 0.0f;
  float sum_second_channel = 0.0f;
  float sum_third_channel = 0.0f;
  float sum_weight = 0.0f;

  // 7. Convolution loop (triangle/bilinear filter)
  // TODO(KokSeang): If necessary, optimize with im2col, separate horizontal
  // and vertical filtering, etc.
  for (int y = y_min; y <= y_max; ++y) {
    // Y-weight: triangle filter (1 - distance / support)
    const float wy = fmaxf(0.0f, 1.0f - fabsf(center_y - y) / support_y);

    // Pre-calculate the row offset once per scanline
    const int row_offset = y * in_w;

    for (int x = x_min; x <= x_max; ++x) {
      // X-weight
      const float wx = fmaxf(0.0f, 1.0f - fabsf(center_x - x) / support_x);

      // Combined weight; skip pixels with no contribution
      const float w = wx * wy;
      if (w > 0.0f) {
        // Input is interleaved HWC with 3 channels, hence the * 3 stride
        const int idx = (row_offset + x) * 3;
        sum_first_channel += input_img[idx] * w;
        sum_second_channel += input_img[idx + 1] * w;
        sum_third_channel += input_img[idx + 2] * w;
        sum_weight += w;
      }
    }
  }

  // 8. Normalize by the accumulated weight (guards a degenerate window;
  // should not trigger inside a valid ROI)
  if (sum_weight > 0.0f) {
    sum_first_channel /= sum_weight;
    sum_second_channel /= sum_weight;
    sum_third_channel /= sum_weight;
  }

  // 9. Write to planar CHW output: [Channel, Height, Width] per camera.
  // Round to nearest (PIL rounds) instead of truncating; the weighted
  // average of uint8 samples is already within [0, 255].
  const int area = roi_h * roi_w;
  const int out_idx = out_y * roi_w + out_x;
  output_img[0 * area + out_idx] = static_cast<std::uint8_t>(sum_first_channel + 0.5f);
  output_img[1 * area + out_idx] = static_cast<std::uint8_t>(sum_second_channel + 0.5f);
  output_img[2 * area + out_idx] = static_cast<std::uint8_t>(sum_third_channel + 0.5f);
}

// Launches the fused resize + ROI-extraction kernel on this camera's stream.
// Returns the launch status from cudaGetLastError().
cudaError_t CameraPreprocess::resizeAndExtractRoi_launch(
  const std::uint8_t * input_img, std::uint8_t * output_img, int H,
  int W,                     // Original image dimensions
  int H2, int W2,            // Resized image dimensions
  int H3, int W3,            // ROI dimensions
  int y_start, int x_start)  // ROI top-left coordinates in resized image
{
  // One thread per output ROI pixel, 16x16 blocks
  dim3 threads(16, 16);
  dim3 blocks(divup(W3, threads.x), divup(H3, threads.y));

  resizeAndExtractRoi_kernel<<<blocks, threads, 0, stream_>>>(
    input_img, output_img, H, W, H2, W2, H3, W3, y_start, x_start);

  // Report launch errors (kernel execution itself is asynchronous)
  return cudaGetLastError();
}

// Launches the undistortion remap kernel on this camera's stream.
// Returns the launch status from cudaGetLastError().
cudaError_t CameraPreprocess::remap_launch(
  const std::uint8_t * input_img, std::uint8_t * output_img, int output_height,
  int output_width,                   // Output (destination) image dimensions
  int input_height, int input_width,  // Input (source) image dimensions
  const float * map_x, const float * map_y)
{
  // Grid sized by the OUTPUT image: one thread per destination pixel
  dim3 threads(16, 16);
  dim3 blocks(divup(output_width, threads.x), divup(output_height, threads.y));

  remap_kernel<<<blocks, threads, 0, stream_>>>(
    input_img, output_img, output_height, output_width, input_height, input_width, map_x, map_y);

  // Report launch errors (kernel execution itself is asynchronous)
  return cudaGetLastError();
}

}  // namespace autoware::bevfusion
a/perception/autoware_bevfusion/lib/preprocess/preprocess_kernel.cu +++ b/perception/autoware_bevfusion/lib/preprocess/preprocess_kernel.cu @@ -153,87 +153,4 @@ std::size_t PreprocessCuda::generateVoxels( return real_num_voxels; } -__global__ void resizeAndExtractRoi_kernel( - const std::uint8_t * __restrict__ input_img, std::uint8_t * __restrict__ output_img, int H, - int W, // Original image dimensions (Height, Width) - int H2, int W2, // Resized image dimensions (Height, Width) - int H3, int W3, // ROI dimensions (Height, Width) - int y_start, int x_start) // ROI top-left coordinates in resized image -{ - // Calculate the global thread indices - int i = blockIdx.x * blockDim.x + threadIdx.x; // Width index in output (ROI) image - int j = blockIdx.y * blockDim.y + threadIdx.y; // Height index in output (ROI) image - - // Check if the thread corresponds to a valid pixel in the ROI - if (i >= W3 || j >= H3) return; - - // Compute scaling factors from original to resized image - auto scale_y = static_cast(H) / H2; - auto scale_x = static_cast(W) / W2; - - // Map output pixel (i, j) in ROI to position in resized image - int i_resized = i + x_start; - int j_resized = j + y_start; - - // Map position in resized image back to position in original image - float i_original = (i_resized + 0.5f) * scale_x - 0.5f; - float j_original = (j_resized + 0.5f) * scale_y - 0.5f; - - // Compute coordinates for bilinear interpolation - auto i0 = static_cast(floorf(i_original)); - auto j0 = static_cast(floorf(j_original)); - int i1 = i0 + 1; - int j1 = j0 + 1; - - // Compute interpolation weights - float di = i_original - i0; - float dj = j_original - j0; - - float w00 = (1.0f - di) * (1.0f - dj); - float w01 = (1.0f - di) * dj; - float w10 = di * (1.0f - dj); - float w11 = di * dj; - - // Loop over the three color channels - for (int c = 0; c < 3; ++c) { - float v00 = 0.0f, v01 = 0.0f, v10 = 0.0f, v11 = 0.0f; - - // Boundary checks for each neighboring pixel - if (i0 >= 0 && i0 < W && 
j0 >= 0 && j0 < H) - v00 = static_cast(input_img[(j0 * W + i0) * 3 + c]); - if (i0 >= 0 && i0 < W && j1 >= 0 && j1 < H) - v01 = static_cast(input_img[(j1 * W + i0) * 3 + c]); - if (i1 >= 0 && i1 < W && j0 >= 0 && j0 < H) - v10 = static_cast(input_img[(j0 * W + i1) * 3 + c]); - if (i1 >= 0 && i1 < W && j1 >= 0 && j1 < H) - v11 = static_cast(input_img[(j1 * W + i1) * 3 + c]); - - // Compute the interpolated pixel value - float value = w00 * v00 + w01 * v01 + w10 * v10 + w11 * v11; - - // Store the result in the output ROI image - output_img[(j * W3 + i) * 3 + c] = static_cast(value); - } -} - -cudaError_t PreprocessCuda::resizeAndExtractRoi_launch( - const std::uint8_t * input_img, std::uint8_t * output_img, int H, - int W, // Original image dimensions - int H2, int W2, // Resized image dimensions - int H3, int W3, // ROI dimensions - int y_start, int x_start, // ROI top-left coordinates in resized image - cudaStream_t stream) -{ - // Define the block and grid dimensions - dim3 threads(16, 16); - dim3 blocks(divup(W3, threads.x), divup(H3, threads.y)); - - // Launch the kernel - resizeAndExtractRoi_kernel<<>>( - input_img, output_img, H, W, H2, W2, H3, W3, y_start, x_start); - - // Check for errors - return cudaGetLastError(); -} - } // namespace autoware::bevfusion diff --git a/perception/autoware_bevfusion/schema/bevfusion.schema.json b/perception/autoware_bevfusion/schema/bevfusion.schema.json index 1e1bb89d1ba48..953922c48b3c9 100644 --- a/perception/autoware_bevfusion/schema/bevfusion.schema.json +++ b/perception/autoware_bevfusion/schema/bevfusion.schema.json @@ -70,6 +70,11 @@ "description": "A name of frame id where world coordinates system is defined with respect to.", "default": "map" }, + "run_image_undistortion": { + "type": "boolean", + "description": "A flag to run image undistortion.", + "default": true + }, "circle_nms_dist_threshold": { "type": "number", "description": "A distance threshold between detections in NMS.", diff --git 
a/perception/autoware_bevfusion/src/bevfusion_node.cpp b/perception/autoware_bevfusion/src/bevfusion_node.cpp index 4b60b0926b7d5..4574bbda39efa 100644 --- a/perception/autoware_bevfusion/src/bevfusion_node.cpp +++ b/perception/autoware_bevfusion/src/bevfusion_node.cpp @@ -127,8 +127,11 @@ BEVFusionNode::BEVFusionNode(const rclcpp::NodeOptions & options) circle_nms_dist_threshold, yaw_norm_thresholds, score_threshold, use_intensity); sensor_fusion_ = config.sensor_fusion_; + use_compressed_images_ = this->declare_parameter("use_compressed_images", false, descriptor); + const auto run_image_undistortion = + this->declare_parameter("run_image_undistortion", descriptor); DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); @@ -144,6 +147,19 @@ BEVFusionNode::BEVFusionNode(const rclcpp::NodeOptions & options) image_backbone_engine_path, 1ULL << 32U) } : TrtBEVFusionConfig{trt_main_config, std::nullopt}; + + // Build Image Preprocessing Parameters + // TODO(KokSeang): Remove image preprocessing parameters out of BEVFusionConfig + auto image_pre_processing_params = ImagePreProcessingParams( + raw_image_height, + raw_image_width, + roi_height, + roi_width, + img_aug_scale_y, + img_aug_scale_x, + run_image_undistortion + ); + // clang-format on detector_ptr_ = std::make_unique(trt_bevfusion_config, densification_param, config); diagnostics_detector_trt_ = @@ -157,7 +173,7 @@ BEVFusionNode::BEVFusionNode(const rclcpp::NodeOptions & options) objects_pub_ = this->create_publisher( "~/output/objects", rclcpp::QoS(1)); - initializeSensorFusionSubscribers(config.num_cameras_); + initializeSensorFusionSubscribers(config.num_cameras_, image_pre_processing_params); published_time_pub_ = std::make_unique(this); @@ -202,7 +218,7 @@ void BEVFusionNode::cloudCallback( std::unordered_map proc_timing; bool is_num_voxels_within_range = true; const bool is_success = detector_ptr_->detect( - pc_msg_ptr, image_msgs_, camera_masks_, tf_buffer_, 
det_boxes3d, proc_timing, + pc_msg_ptr, camera_data_ptrs_, camera_masks_, tf_buffer_, det_boxes3d, proc_timing, is_num_voxels_within_range); if (!is_success) { @@ -243,25 +259,24 @@ void BEVFusionNode::cloudCallback( void BEVFusionNode::imageCallback( const sensor_msgs::msg::Image::ConstSharedPtr msg, std::size_t camera_id) { - image_msgs_[camera_id] = msg; + camera_data_ptrs_[camera_id]->update_image_msg(msg); std::size_t num_valid_images = std::count_if( - image_msgs_.begin(), image_msgs_.end(), - [](const auto & image_msg) { return image_msg != nullptr; }); + camera_data_ptrs_.begin(), camera_data_ptrs_.end(), + [](const auto & camera_data) { return camera_data->is_image_msg_available(); }); - images_available_ = num_valid_images == image_msgs_.size(); + images_available_ = num_valid_images == camera_data_ptrs_.size(); } void BEVFusionNode::cameraInfoCallback( const sensor_msgs::msg::CameraInfo & msg, std::size_t camera_id) { - camera_info_msgs_[camera_id] = msg; - + camera_data_ptrs_[camera_id]->update_camera_info(msg); std::size_t num_valid_intrinsics = std::count_if( - camera_info_msgs_.begin(), camera_info_msgs_.end(), - [](const auto & opt) { return opt.has_value(); }); + camera_data_ptrs_.begin(), camera_data_ptrs_.end(), + [](const auto & camera_data) { return camera_data->is_camera_info_available(); }); - intrinsics_available_ = num_valid_intrinsics == camera_info_msgs_.size(); + intrinsics_available_ = num_valid_intrinsics == camera_data_ptrs_.size(); if ( lidar2camera_extrinsics_[camera_id].has_value() || !lidar_frame_.has_value() || diff --git a/perception/autoware_bevfusion/src/bevfusion_node_utils.cpp b/perception/autoware_bevfusion/src/bevfusion_node_utils.cpp index 884c1c0b18f65..6a2c702290f72 100644 --- a/perception/autoware_bevfusion/src/bevfusion_node_utils.cpp +++ b/perception/autoware_bevfusion/src/bevfusion_node_utils.cpp @@ -41,7 +41,8 @@ void BEVFusionNode::validateParameters( } } -void 
BEVFusionNode::initializeSensorFusionSubscribers(std::int64_t num_cameras) +void BEVFusionNode::initializeSensorFusionSubscribers( + std::int64_t num_cameras, const ImagePreProcessingParams & image_pre_processing_params) { if (!sensor_fusion_) { return; @@ -49,9 +50,9 @@ void BEVFusionNode::initializeSensorFusionSubscribers(std::int64_t num_cameras) image_subs_.resize(num_cameras); camera_info_subs_.resize(num_cameras); - image_msgs_.resize(num_cameras); - camera_info_msgs_.resize(num_cameras); lidar2camera_extrinsics_.resize(num_cameras); + camera_data_ptrs_.resize(num_cameras); + camera_matrices_ptrs_.resize(num_cameras); auto resolve_topic_name = [this](const std::string & query) { return this->get_node_topics_interface()->resolve_topic_name(query); @@ -59,6 +60,15 @@ void BEVFusionNode::initializeSensorFusionSubscribers(std::int64_t num_cameras) const std::string transport = use_compressed_images_ ? "compressed" : "raw"; for (std::int64_t camera_id = 0; camera_id < num_cameras; ++camera_id) { + // First construct CameraMatrices, CameraMatrices is shared by multiple CameraData for the same + // camera_id + camera_matrices_ptrs_[camera_id] = std::make_shared(); + + // Then construct CameraData + camera_data_ptrs_[camera_id] = std::make_unique( + this, static_cast(camera_id), image_pre_processing_params, + camera_matrices_ptrs_[camera_id]); + // Explicitly resolve the topic name using the node name and namespace, please check // https://github.com/ros-perception/image_transport_plugins/issues/155 const std::string base_topic = resolve_topic_name("~/input/image" + std::to_string(camera_id)); @@ -111,8 +121,8 @@ void BEVFusionNode::precomputeIntrinsicsExtrinsics() std::vector lidar2camera_extrinsics; std::transform( - camera_info_msgs_.begin(), camera_info_msgs_.end(), std::back_inserter(camera_info_msgs), - [](const auto & opt) { return *opt; }); + camera_data_ptrs_.begin(), camera_data_ptrs_.end(), std::back_inserter(camera_info_msgs), + [](const auto & 
camera_data) { return camera_data->camera_info_value(); }); std::transform( lidar2camera_extrinsics_.begin(), lidar2camera_extrinsics_.end(), @@ -124,12 +134,11 @@ void BEVFusionNode::precomputeIntrinsicsExtrinsics() void BEVFusionNode::computeCameraMasks(double lidar_stamp) { - camera_masks_.resize(camera_info_msgs_.size()); + camera_masks_.resize(camera_data_ptrs_.size()); for (std::size_t i = 0; i < camera_masks_.size(); ++i) { + auto camera_mask_timestamp = rclcpp::Time(camera_data_ptrs_[i]->image_msg()->header.stamp); camera_masks_[i] = - (lidar_stamp - rclcpp::Time(image_msgs_[i]->header.stamp).seconds()) < max_camera_lidar_delay_ - ? 1.0 - : 0.f; + (lidar_stamp - camera_mask_timestamp.seconds()) < max_camera_lidar_delay_ ? 1.0 : 0.f; } }