-
Notifications
You must be signed in to change notification settings - Fork 892
feat(autoware_ptv3): source cloud reconstruction & entropy #12547
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
55caa03
4fc27a3
49bc420
5894e2d
d3bd08b
4693245
2d0f101
6dfc62d
f6433aa
efc7ab1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Temporary changes made it into the PR |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,19 +26,21 @@ struct OutputSegmentationPointType | |
| float z; | ||
| std::uint8_t class_id; | ||
| float probability; | ||
| float entropy; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this expected to be used by downstream modules? If it's only for debug/evaluation it should ideally be disabled/excluded in a production context.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ktro2828 do you plan to use Anyway, if
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @amadeuszsz From the perspective of the downstream module, it's fine to change the field unless the existing fields are not deleted. |
||
| } __attribute__((packed)); | ||
|
|
||
| __global__ void createVisualizationPointcloudKernel( | ||
| const float4 * input_features, const float * colors, const std::int64_t * labels, | ||
| float4 * output_points, std::size_t num_points) | ||
| float4 * output_points, std::size_t num_classes, std::size_t num_points) | ||
| { | ||
| const auto idx = static_cast<std::uint32_t>(blockIdx.x * blockDim.x + threadIdx.x); | ||
| if (idx >= num_points) { | ||
| return; | ||
| } | ||
|
|
||
| const auto label = labels[idx]; | ||
| const auto color = colors[label]; | ||
| const auto color = | ||
| label >= 0 && static_cast<std::size_t>(label) < num_classes ? colors[label] : 0.0f; | ||
|
|
||
| output_points[idx] = | ||
| make_float4(input_features[idx].x, input_features[idx].y, input_features[idx].z, color); | ||
|
|
@@ -55,11 +57,75 @@ __global__ void createSegmentationPointcloudKernel( | |
|
|
||
| const auto input_point = input_features[idx]; | ||
| const auto label = labels[idx]; | ||
| const bool has_valid_label = label >= 0 && static_cast<std::size_t>(label) < num_classes; | ||
| float entropy = 0.0f; | ||
| for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { | ||
| const auto probability = pred_probs[idx * num_classes + class_idx]; | ||
| if (probability > 0.0f) { | ||
| entropy -= probability * logf(probability); | ||
| } | ||
| } | ||
| if (num_classes > 1) { | ||
| entropy /= logf(static_cast<float>(num_classes)); | ||
| } | ||
| output_points[idx].x = input_point.x; | ||
| output_points[idx].y = input_point.y; | ||
| output_points[idx].z = input_point.z; | ||
| output_points[idx].class_id = static_cast<std::uint8_t>(label); | ||
| output_points[idx].probability = pred_probs[idx * num_classes + label]; | ||
| output_points[idx].class_id = has_valid_label ? static_cast<std::uint8_t>(label) : 255U; | ||
| output_points[idx].probability = has_valid_label ? pred_probs[idx * num_classes + label] : 0.0f; | ||
| output_points[idx].entropy = entropy; | ||
| } | ||
|
|
||
| __global__ void reconstructPartialKernel( | ||
| const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs, | ||
| std::int64_t * output_labels, float * output_probs, std::size_t num_classes, | ||
| std::size_t num_cropped_points, std::size_t num_voxels) | ||
|
amadeuszsz marked this conversation as resolved.
Outdated
|
||
| { | ||
| const auto idx = static_cast<std::uint32_t>(blockIdx.x * blockDim.x + threadIdx.x); | ||
| if (idx >= num_cropped_points) { | ||
| return; | ||
| } | ||
|
|
||
| const auto voxel_idx = inverse_map[idx]; | ||
| const bool has_valid_voxel = voxel_idx >= 0 && static_cast<std::size_t>(voxel_idx) < num_voxels; | ||
| output_labels[idx] = has_valid_voxel ? voxel_labels[voxel_idx] : 255; | ||
| for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { | ||
| output_probs[idx * num_classes + class_idx] = | ||
| has_valid_voxel ? voxel_probs[voxel_idx * num_classes + class_idx] : 0.0f; | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At a glance, it looks like this loop still can be parallelized because each iteration is independent of the others. Do you think we can add one more (i.e.,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested it and postprocessing went up from 4 ms to 7 ms:
Each block does trivially little work, so the GPU block scheduler overhead far exceeds the cost of the original 20~ iteration
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for testing. If you have spare time, it would be great if you also try adding This is not mandatory. I can approve this PR if there are no other unhandled comments from other reviews.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried multiple configuration, seems this is the best. Results are very close to these reported in PR description. Note: I assumed more than 32 classes (soon) and this is important due to warp size. |
||
| } | ||
|
|
||
| __global__ void reconstructFullKernel( | ||
| const std::uint32_t * crop_mask, const std::uint32_t * crop_indices, | ||
| const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs, | ||
| std::int64_t * output_labels, float * output_probs, std::size_t num_classes, | ||
| std::size_t num_points, std::size_t num_voxels) | ||
| { | ||
| const auto idx = static_cast<std::uint32_t>(blockIdx.x * blockDim.x + threadIdx.x); | ||
| if (idx >= num_points) { | ||
| return; | ||
| } | ||
|
|
||
| for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { | ||
| output_probs[idx * num_classes + class_idx] = 0.0f; | ||
| } | ||
|
|
||
| if (crop_mask[idx] == 0) { | ||
| output_labels[idx] = 255; | ||
| return; | ||
| } | ||
|
|
||
| const auto cropped_idx = crop_indices[idx] - 1; | ||
| const auto voxel_idx = inverse_map[cropped_idx]; | ||
| if (voxel_idx < 0 || static_cast<std::size_t>(voxel_idx) >= num_voxels) { | ||
| output_labels[idx] = 255; | ||
| return; | ||
| } | ||
|
|
||
| output_labels[idx] = voxel_labels[voxel_idx]; | ||
| for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { | ||
| output_probs[idx * num_classes + class_idx] = voxel_probs[voxel_idx * num_classes + class_idx]; | ||
| } | ||
| } | ||
|
|
||
| template <typename OutputPointT> | ||
|
|
@@ -213,13 +279,13 @@ PostprocessCuda::PostprocessCuda(const PTv3Config & config, cudaStream_t stream) | |
|
|
||
| void PostprocessCuda::createVisualizationPointcloud( | ||
| const float * input_features, const std::int64_t * labels, float * output_points, | ||
| std::size_t num_points) | ||
| std::size_t num_classes, std::size_t num_points) | ||
| { | ||
| auto num_blocks = divup(num_points, config_.threads_per_block_); | ||
|
|
||
| createVisualizationPointcloudKernel<<<num_blocks, config_.threads_per_block_, 0, stream_>>>( | ||
| reinterpret_cast<const float4 *>(input_features), color_map_d_.get(), labels, | ||
| reinterpret_cast<float4 *>(output_points), num_points); | ||
| reinterpret_cast<float4 *>(output_points), num_classes, num_points); | ||
|
|
||
| CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); | ||
| } | ||
|
|
@@ -237,6 +303,35 @@ void PostprocessCuda::createSegmentationPointcloud( | |
| CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); | ||
| } | ||
|
|
||
| void PostprocessCuda::reconstructPartial( | ||
| const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs, | ||
| std::int64_t * output_labels, float * output_probs, std::size_t num_classes, | ||
| std::size_t num_cropped_points, std::size_t num_voxels) | ||
| { | ||
| auto num_blocks = divup(num_cropped_points, config_.threads_per_block_); | ||
|
|
||
| reconstructPartialKernel<<<num_blocks, config_.threads_per_block_, 0, stream_>>>( | ||
| inverse_map, voxel_labels, voxel_probs, output_labels, output_probs, num_classes, | ||
| num_cropped_points, num_voxels); | ||
|
|
||
| CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); | ||
| } | ||
|
|
||
| void PostprocessCuda::reconstructFull( | ||
| const std::uint32_t * crop_mask, const std::uint32_t * crop_indices, | ||
| const std::int64_t * inverse_map, const std::int64_t * voxel_labels, const float * voxel_probs, | ||
| std::int64_t * output_labels, float * output_probs, std::size_t num_classes, | ||
| std::size_t num_points, std::size_t num_voxels) | ||
| { | ||
| auto num_blocks = divup(num_points, config_.threads_per_block_); | ||
|
|
||
| reconstructFullKernel<<<num_blocks, config_.threads_per_block_, 0, stream_>>>( | ||
| crop_mask, crop_indices, inverse_map, voxel_labels, voxel_probs, output_labels, output_probs, | ||
| num_classes, num_points, num_voxels); | ||
|
|
||
| CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); | ||
| } | ||
|
|
||
| std::size_t PostprocessCuda::createFilteredPointcloud( | ||
| const void * compact_input_points, CloudFormat input_format, CloudFormat output_format, | ||
| const float * pred_probs, void * output_points, std::size_t num_classes, std::size_t num_points) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.