Skip to content

Commit d6e3535

Browse files
mojomex and Copilot committed
perf(autoware_tensorrt_plugins): keep SegmentCSR allocation-free
Initialize the SegmentCSR output buffer directly instead of allocating, filling, copying, and freeing a scratch base buffer on every launch.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: Max SCHMELLER <max.schmeller@tier4.jp>
1 parent ccee12e commit d6e3535

1 file changed

Lines changed: 1 addition & 8 deletions

File tree

perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -130,13 +130,8 @@ int32_t segment_csr_launch(
130130
fill_kernel<int64_t>
131131
<<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(arg_indices_out, out_numel, num_rows_in);
132132

133-
scalar_t * base_values{nullptr};
134-
cudaMallocAsync(&base_values, sizeof(scalar_t) * out_numel, stream_in);
135133
fill_kernel<scalar_t><<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(
136-
base_values, out_numel, static_cast<scalar_t>(0));
137-
cudaMemcpyAsync(
138-
reduced_values_out, base_values, sizeof(scalar_t) * out_numel, cudaMemcpyDeviceToDevice,
139-
stream_in);
134+
reduced_values_out, out_numel, static_cast<scalar_t>(0));
140135

141136
if (num_cols_in == 1)
142137
segment_csr_kernel<scalar_t, REDUCE, 1><<<BLOCKS(32, num_segments), THREADS, 0, stream_in>>>(
@@ -145,8 +140,6 @@ int32_t segment_csr_launch(
145140
segment_csr_broadcast_kernel<scalar_t, REDUCE>
146141
<<<BLOCKS(1, num_segments * num_cols_in), THREADS, 0, stream_in>>>(
147142
src_in, indptr_in, reduced_values_out, arg_indices_out, num_segments, num_cols_in);
148-
149-
cudaFreeAsync(base_values, stream_in);
150143
return 0;
151144
}
152145

0 commit comments

Comments
 (0)