From 2ff93e3a2b36f0147906a7ad9a1fc65caad6fe03 Mon Sep 17 00:00:00 2001 From: Max SCHMELLER Date: Wed, 10 Jun 2026 17:49:44 +0900 Subject: [PATCH] perf(autoware_tensorrt_plugins): keep SegmentCSR allocation-free Initialize the SegmentCSR output buffer directly instead of allocating, filling, copying, and freeing a scratch base buffer on every launch. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Max SCHMELLER --- .../src/scatter_ops/segment_csr.cu | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu b/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu index e4d5484d3c3af..1ab2367f79a74 100644 --- a/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu +++ b/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu @@ -130,13 +130,8 @@ int32_t segment_csr_launch( fill_kernel <<>>(arg_indices_out, out_numel, num_rows_in); - scalar_t * base_values{nullptr}; - cudaMallocAsync(&base_values, sizeof(scalar_t) * out_numel, stream_in); fill_kernel<<>>( - base_values, out_numel, static_cast(0)); - cudaMemcpyAsync( - reduced_values_out, base_values, sizeof(scalar_t) * out_numel, cudaMemcpyDeviceToDevice, - stream_in); + reduced_values_out, out_numel, static_cast(0)); if (num_cols_in == 1) segment_csr_kernel<<>>( @@ -145,8 +140,6 @@ int32_t segment_csr_launch( segment_csr_broadcast_kernel <<>>( src_in, indptr_in, reduced_values_out, arg_indices_out, num_segments, num_cols_in); - - cudaFreeAsync(base_values, stream_in); return 0; }