Skip to content
Open
1 change: 1 addition & 0 deletions perception/autoware_ptv3/config/ptv3.param.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# network
trt_precision: fp16
cloud_capacity: 2000000
source_reconstruction: full
onnx_path: "$(var model_path)/ptv3.onnx"
Comment thread
amadeuszsz marked this conversation as resolved.
engine_path: "$(var model_path)/ptv3.engine"
filter:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,23 @@ class PostprocessCuda

void createVisualizationPointcloud(
const float * input_features, const std::int64_t * pred_labels, float * output_points,
std::size_t num_points);
std::size_t num_classes, std::size_t num_points);

void createSegmentationPointcloud(
const float * input_features, const std::int64_t * pred_labels, const float * pred_probs,
std::uint8_t * output_points, std::size_t num_classes, std::size_t num_points);

void reconstructPartial(
const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs,
std::int64_t * output_labels, float * output_probs, std::size_t num_classes,
std::size_t num_cropped_points, std::size_t num_voxels);

void reconstructFull(
const std::uint32_t * crop_mask, const std::uint32_t * crop_indices,
const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs,
std::int64_t * output_labels, float * output_probs, std::size_t num_classes,
std::size_t num_points, std::size_t num_voxels);

std::size_t createFilteredPointcloud(
const void * compact_input_points, CloudFormat input_format, CloudFormat output_format,
const float * pred_probs, void * output_points, std::size_t num_classes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ class PreprocessCuda
std::size_t generateFeatures(
const void * input_data, CloudFormat input_format, unsigned int num_points,
float * voxel_features, std::int64_t * voxel_coords, std::int64_t * voxel_hashes,
void * compact_points);
void * compact_points, float * reconstruction_features, void * cropped_source_points,
std::int64_t * inverse_map, std::size_t * num_cropped_points);

[[nodiscard]] const std::uint32_t * cropMask() const { return crop_mask_d_.get(); }
[[nodiscard]] const std::uint32_t * cropIndices() const { return crop_indices_d_.get(); }

private:
PTv3Config config_;
Expand Down
25 changes: 24 additions & 1 deletion perception/autoware_ptv3/include/autoware/ptv3/ptv3_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
namespace autoware::ptv3
{

/// Strategy for projecting voxel-level predictions back onto the source point cloud.
/// NONE: no reconstruction is performed.
/// PARTIAL: labels/probabilities are produced only for the points that survived
///   range cropping (see reconstructPartial's num_cropped_points).
/// FULL: every input point receives an output; points outside the crop get a
///   sentinel label (see reconstructFull's crop_mask handling).
enum class SourceReconstruction {
  NONE,
  PARTIAL,
  FULL,
};

class PTv3Config
{
public:
Expand All @@ -36,7 +42,8 @@ class PTv3Config
const std::vector<std::int64_t> & voxels_num, const std::vector<float> & point_cloud_range,
const std::vector<float> & voxel_size, const std::vector<std::string> & class_names,
const std::vector<std::int64_t> & palette, const float filter_class_probability_threshold,
const std::vector<std::string> & filter_classes, const std::string & filter_output_format)
const std::vector<std::string> & filter_classes, const std::string & filter_output_format,
const std::string & source_reconstruction)
{
plugins_path_ = plugins_path;

Expand Down Expand Up @@ -91,6 +98,21 @@ class PTv3Config
filter_class_probability_threshold_ = filter_class_probability_threshold;
filter_class_indices_ = make_filter_class_indices(class_names_, filter_classes);
filter_output_format_ = filter_output_format;
source_reconstruction_ = parse_source_reconstruction(source_reconstruction);
}

/// @brief Parse the `source_reconstruction` parameter string into its enum value.
/// @param value Expected to be one of "none", "partial", or "full" (case-sensitive).
/// @return The corresponding SourceReconstruction mode.
/// @throws std::runtime_error if the string is not a recognized mode.
static SourceReconstruction parse_source_reconstruction(const std::string & value)
{
  if (value == "none") {
    return SourceReconstruction::NONE;
  }
  if (value == "partial") {
    return SourceReconstruction::PARTIAL;
  }
  if (value == "full") {
    return SourceReconstruction::FULL;
  }
  // Echo the offending value so a misconfigured param file is easy to diagnose.
  throw std::runtime_error(
    "Invalid source_reconstruction value '" + value +
    "'. It must be one of: 'none', 'partial', or 'full'.");
}

static std::vector<std::uint32_t> make_filter_class_indices(
Expand Down Expand Up @@ -155,6 +177,7 @@ class PTv3Config
float filter_class_probability_threshold_{};
std::vector<std::uint32_t> filter_class_indices_;
std::string filter_output_format_;
SourceReconstruction source_reconstruction_{SourceReconstruction::NONE};

// Common network parameters
std::int64_t cloud_capacity_{};
Expand Down
8 changes: 8 additions & 0 deletions perception/autoware_ptv3/include/autoware/ptv3/ptv3_trt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,16 @@ class PTV3_PUBLIC PTv3TRT

// Preprocess outputs
std::int64_t num_voxels_{0};
std::int64_t num_cropped_points_{0}; // only for partial
std::int64_t num_source_points_{0}; // only for full
const void * current_input_data_{nullptr}; // only for full

CudaUniquePtr<std::uint8_t[]> compact_points_d_{nullptr};
CudaUniquePtr<std::uint8_t[]> cropped_source_points_d_{nullptr}; // only for partial
CudaUniquePtr<float[]> reconstructed_features_d_{nullptr}; // only for partial and full
CudaUniquePtr<std::int64_t[]> inverse_map_d_{nullptr}; // only for partial and full
CudaUniquePtr<std::int64_t[]> reconstructed_labels_d_{nullptr}; // only for partial and full
CudaUniquePtr<float[]> reconstructed_probs_d_{nullptr}; // only for partial and full
CudaUniquePtr<std::int64_t[]> grid_coord_d_{nullptr};
CudaUniquePtr<float[]> feat_d_{nullptr};
CudaUniquePtr<std::int64_t[]> serialized_code_d_{nullptr};
Expand Down
8 changes: 4 additions & 4 deletions perception/autoware_ptv3/launch/ptv3.launch.xml
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Temporary changes made it into the PR

Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@
<group unless="$(var use_pointcloud_container)">
<node pkg="autoware_ptv3" exec="autoware_ptv3_node" name="ptv3" output="screen" args="--ros-args --log-level $(var log_level)">
<remap from="~/input/pointcloud" to="$(var input/pointcloud)"/>
<!-- The /cuda remaps must mirror their plain counterparts; the previous
     "/cuda/dummy" suffixes were temporary debug remaps that must not ship. -->
<remap from="~/input/pointcloud/cuda" to="$(var input/pointcloud)/cuda"/>
<remap from="~/output/pointcloud/segmentation" to="$(var output/pointcloud/segmentation)"/>
<remap from="~/output/pointcloud/segmentation/cuda" to="$(var output/pointcloud/segmentation)/cuda"/>
<remap from="~/output/pointcloud/visualization" to="$(var output/pointcloud/visualization)"/>
<remap from="~/output/pointcloud/visualization/cuda" to="$(var output/pointcloud/visualization)/cuda"/>
<remap from="~/output/pointcloud/filtered" to="$(var output/pointcloud/filtered)"/>
<remap from="~/output/pointcloud/filtered/cuda" to="$(var output/pointcloud/filtered)/cuda"/>

<param from="$(var model_param_path)" allow_substs="true"/>
<param from="$(var ml_package_param_path)" allow_substs="true"/>
Expand Down
103 changes: 97 additions & 6 deletions perception/autoware_ptv3/lib/postprocess/postprocess_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,21 @@ struct OutputSegmentationPointType
float z;
std::uint8_t class_id;
float probability;
float entropy;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this expected to be used by downstream modules? If it's only for debug/evaluation it should ideally be disabled/excluded in a production context.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ktro2828 do you plan to use ~/ptv3/segmentation topic? Are you ok with this change?

Anyway, if segmentation topic will be used in any downstream module, I think entropy can be useful as probability can be overconfident. So far PTv3 does not support real confidence output, therefore maybe we can keep entropy as extra measurement. Please note we do not use this publisher if not subscribed, so the overhead of this change is negligible.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amadeuszsz From the perspective of the downstream module, it's fine to add fields as long as the existing fields are not deleted.
Currently, the downstream module refers to x/y/z and optionally class_id/probability by field name using PointCloud2ConstIterator, so adding some other field does not affect iteration order.

} __attribute__((packed));

__global__ void createVisualizationPointcloudKernel(
const float4 * input_features, const float * colors, const std::int64_t * labels,
float4 * output_points, std::size_t num_points)
float4 * output_points, std::size_t num_classes, std::size_t num_points)
{
const auto idx = static_cast<std::uint32_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx >= num_points) {
return;
}

const auto label = labels[idx];
const auto color = colors[label];
const auto color =
label >= 0 && static_cast<std::size_t>(label) < num_classes ? colors[label] : 0.0f;

output_points[idx] =
make_float4(input_features[idx].x, input_features[idx].y, input_features[idx].z, color);
Expand All @@ -55,11 +57,71 @@ __global__ void createSegmentationPointcloudKernel(

const auto input_point = input_features[idx];
const auto label = labels[idx];
const bool has_valid_label = label >= 0 && static_cast<std::size_t>(label) < num_classes;
float entropy = 0.0f;
for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) {
const auto probability = pred_probs[idx * num_classes + class_idx];
if (probability > 0.0f) {
entropy -= probability * logf(probability);
}
}
if (num_classes > 1) {
entropy /= logf(static_cast<float>(num_classes));
}
output_points[idx].x = input_point.x;
output_points[idx].y = input_point.y;
output_points[idx].z = input_point.z;
output_points[idx].class_id = static_cast<std::uint8_t>(label);
output_points[idx].probability = pred_probs[idx * num_classes + label];
output_points[idx].class_id = has_valid_label ? static_cast<std::uint8_t>(label) : 255U;
output_points[idx].probability = has_valid_label ? pred_probs[idx * num_classes + label] : 0.0f;
output_points[idx].entropy = entropy;
}

__global__ void reconstructPartialKernel(
const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs,
std::int64_t * output_labels, float * output_probs, std::size_t num_classes,
std::size_t num_cropped_points, std::size_t num_voxels)
Comment thread
amadeuszsz marked this conversation as resolved.
Outdated
{
const auto idx = static_cast<std::uint32_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx >= num_cropped_points) {
return;
}

const auto voxel_idx = inverse_map[idx];
const bool has_valid_voxel = voxel_idx >= 0 && static_cast<std::size_t>(voxel_idx) < num_voxels;
output_labels[idx] = has_valid_voxel ? voxel_labels[voxel_idx] : 255;
for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) {
output_probs[idx * num_classes + class_idx] =
has_valid_voxel ? voxel_probs[voxel_idx * num_classes + class_idx] : 0.0f;
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At a glance, it looks like this loop still can be parallelized because each iteration is independent of the others. Do you think we can add one more (i.e., y) block dimension for more parallelization?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested it and postprocessing went up from 4 ms to 7 ms:

  • preprocessing: 2.65 ms
  • inference: 35.48 ms
  • postprocessing: 7.03 ms
  • total: 45.18 ms

Each block does trivially little work, so the GPU block scheduler overhead far exceeds the cost of the original ~20-iteration for loop.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for testing. If you have spare time, it would be great if you also try adding y dimension to the grid instead of the block (i.e., auto y = blockIdx.y; instead of auto y = threadIdx.y;) because this generates a separate executable unit that would be executed on the different SMs, which requires less overhead.

This is not mandatory. I can approve this PR if there are no other unhandled comments from other reviews.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried multiple configurations; this one seems to be the best. Results are very close to those reported in the PR description.

Note: I assumed more than 32 classes (soon) and this is important due to warp size.

}

// Maps voxel-level predictions back onto every point of the original cloud.
// One thread per source point; points discarded by the crop (or mapped to an
// out-of-range voxel) receive the sentinel label 255 and their probability
// slots are left untouched.
__global__ void reconstructFullKernel(
  const std::uint32_t * crop_mask, const std::uint32_t * crop_indices,
  const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs,
  std::int64_t * output_labels, float * output_probs, std::size_t num_classes,
  std::size_t num_points, std::size_t num_voxels)
{
  const auto point_idx = static_cast<std::uint32_t>(blockIdx.x * blockDim.x + threadIdx.x);
  if (point_idx >= num_points) {
    return;
  }

  // Resolve the voxel this point belongs to, if it survived the crop.
  // crop_indices holds 1-based positions within the cropped cloud.
  std::int64_t voxel_idx = -1;
  if (crop_mask[point_idx] != 0) {
    voxel_idx = inverse_map[crop_indices[point_idx] - 1];
  }

  const bool valid_voxel = voxel_idx >= 0 && static_cast<std::size_t>(voxel_idx) < num_voxels;
  if (!valid_voxel) {
    output_labels[point_idx] = 255;
    return;
  }

  output_labels[point_idx] = voxel_labels[voxel_idx];
  for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) {
    output_probs[point_idx * num_classes + class_idx] =
      voxel_probs[voxel_idx * num_classes + class_idx];
  }
}

template <typename OutputPointT>
Expand Down Expand Up @@ -213,13 +275,13 @@ PostprocessCuda::PostprocessCuda(const PTv3Config & config, cudaStream_t stream)

void PostprocessCuda::createVisualizationPointcloud(
const float * input_features, const std::int64_t * labels, float * output_points,
std::size_t num_points)
std::size_t num_classes, std::size_t num_points)
{
auto num_blocks = divup(num_points, config_.threads_per_block_);

createVisualizationPointcloudKernel<<<num_blocks, config_.threads_per_block_, 0, stream_>>>(
reinterpret_cast<const float4 *>(input_features), color_map_d_.get(), labels,
reinterpret_cast<float4 *>(output_points), num_points);
reinterpret_cast<float4 *>(output_points), num_classes, num_points);

CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
}
Expand All @@ -237,6 +299,35 @@ void PostprocessCuda::createSegmentationPointcloud(
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
}

// Copies per-voxel labels/probabilities back to each cropped source point via
// inverse_map, then synchronizes so outputs are ready when this returns.
void PostprocessCuda::reconstructPartial(
  const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs,
  std::int64_t * output_labels, float * output_probs, std::size_t num_classes,
  std::size_t num_cropped_points, std::size_t num_voxels)
{
  // One thread per cropped point.
  const auto grid_size = divup(num_cropped_points, config_.threads_per_block_);

  reconstructPartialKernel<<<grid_size, config_.threads_per_block_, 0, stream_>>>(
    inverse_map, voxel_labels, voxel_probs, output_labels, output_probs, num_classes,
    num_cropped_points, num_voxels);

  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
}

// Projects voxel predictions onto the full (uncropped) source cloud using the
// crop mask/indices, then synchronizes so outputs are ready when this returns.
void PostprocessCuda::reconstructFull(
  const std::uint32_t * crop_mask, const std::uint32_t * crop_indices,
  const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs,
  std::int64_t * output_labels, float * output_probs, std::size_t num_classes,
  std::size_t num_points, std::size_t num_voxels)
{
  // One thread per source point.
  const auto grid_size = divup(num_points, config_.threads_per_block_);

  reconstructFullKernel<<<grid_size, config_.threads_per_block_, 0, stream_>>>(
    crop_mask, crop_indices, inverse_map, voxel_labels, voxel_probs, output_labels, output_probs,
    num_classes, num_points, num_voxels);

  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
}

std::size_t PostprocessCuda::createFilteredPointcloud(
const void * compact_input_points, CloudFormat input_format, CloudFormat output_format,
const float * pred_probs, void * output_points, std::size_t num_classes, std::size_t num_points)
Expand Down
Loading
Loading