From 2ff93e3a2b36f0147906a7ad9a1fc65caad6fe03 Mon Sep 17 00:00:00 2001
From: Max SCHMELLER <max.schmeller@tier4.jp>
Date: Wed, 10 Jun 2026 17:49:44 +0900
Subject: [PATCH] perf(autoware_tensorrt_plugins): keep SegmentCSR
 allocation-free

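Zero-initialize the SegmentCSR output buffer (reduced_values_out)
directly with fill_kernel instead of allocating a scratch base buffer
with cudaMallocAsync, filling it, copying it into the output with
cudaMemcpyAsync, and freeing it with cudaFreeAsync on every launch.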

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Max SCHMELLER <max.schmeller@tier4.jp>
---
 .../src/scatter_ops/segment_csr.cu                       | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu b/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu
index e4d5484d3c3af..1ab2367f79a74 100644
--- a/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu
+++ b/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu
@@ -130,13 +130,8 @@ int32_t segment_csr_launch(
     fill_kernel<int64_t>
       <<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(arg_indices_out, out_numel, num_rows_in);
 
-  scalar_t * base_values{nullptr};
-  cudaMallocAsync(&base_values, sizeof(scalar_t) * out_numel, stream_in);
   fill_kernel<scalar_t><<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(
-    base_values, out_numel, static_cast<scalar_t>(0));
-  cudaMemcpyAsync(
-    reduced_values_out, base_values, sizeof(scalar_t) * out_numel, cudaMemcpyDeviceToDevice,
-    stream_in);
+    reduced_values_out, out_numel, static_cast<scalar_t>(0));
 
   if (num_cols_in == 1)
     segment_csr_kernel<scalar_t, REDUCE, 1><<<BLOCKS(32, num_segments), THREADS, 0, stream_in>>>(
@@ -145,8 +140,6 @@ int32_t segment_csr_launch(
     segment_csr_broadcast_kernel<scalar_t, REDUCE>
       <<<BLOCKS(1, num_segments * num_cols_in), THREADS, 0, stream_in>>>(
         src_in, indptr_in, reduced_values_out, arg_indices_out, num_segments, num_cols_in);
-
-  cudaFreeAsync(base_values, stream_in);
   return 0;
 }