refactor(autoware_tensorrt_plugins): remove SegmentCSR nD indptr path

mojomex · Copilot · mojomex · commit 0f14e0a9b7a6 · 2026-06-11T18:06:13.000+09:00
Remove the unused n-dimensional indptr launcher path and offset helper now that the plugin contract is explicitly 1D.

Co-authored-by: Copilot &lt;223556219+Copilot@users.noreply.github.com&gt;
Signed-off-by: Max SCHMELLER &lt;max.schmeller@tier4.jp&gt;
diff --git a/perception/autoware_tensorrt_plugins/include/autoware/scatter_ops/segment_csr.h b/perception/autoware_tensorrt_plugins/include/autoware/scatter_ops/segment_csr.h
@@ -26,18 +26,37 @@
 #include <cuda_runtime.h>
 
 #include <cstdint>
-#include <vector>
 
+/// Reduces a row-major `src` matrix along contiguous CSR segments on `stream_in`.
+///
+/// Each output segment `s` is the `REDUCE` of the rows
+/// `src_in[indptr_in[s] : indptr_in[s + 1]]`, applied per column.
+///
+/// Input contract:
+/// - `src_in` points to `num_rows_in * num_cols_in` `scalar_t` values, row-major.
+/// - `num_rows_in`, `num_cols_in`, and `indptr_size_in` must be non-negative.
+/// - `indptr_in` points to `indptr_size_in` int64 CSR row pointers in non-decreasing order
+///   with `0 <= indptr_in[i] <= num_rows_in`. The segment count is
+///   `num_segments = max(indptr_size_in - 1, 0)`; when `indptr_size_in == 0`, `indptr_in` is not
+///   read and no output segments are produced.
+/// - `reduced_values_out` points to `num_segments * num_cols_in` `scalar_t` values.
+/// - `arg_indices_out` is only used for `REDUCE == MIN` or `REDUCE == MAX`. When non-null for those
+///   reductions it points to `num_segments * num_cols_in` int64 values. Pass `nullptr` for SUM,
+///   MEAN, MUL, and DIV, or when argmin/argmax indices are not needed.
+/// - All pointers are device pointers.
+///
+/// Output contract:
+/// - Returns `0` if all work was enqueued successfully, or `-1` if any size argument is negative.
+/// - `reduced_values_out[s * num_cols_in + c]` contains the reduction of column `c` of
+///   `src_in[indptr_in[s] : indptr_in[s + 1]]`. Empty segments produce the reducer's empty value:
+///   `0` for SUM, MEAN, MIN, and MAX, and `1` for MUL and DIV.
+/// - When written, `arg_indices_out[s * num_cols_in + c]` contains the source row index that
+///   produced the min/max for that output cell. Cells corresponding to empty segments are left at
+///   the sentinel value `num_rows_in` (an out-of-range index).
 template <typename scalar_t, ReductionType REDUCE>
 int32_t segment_csr_launch(
-  const scalar_t * src_in, const std::vector<int32_t> & src_size_in, const int64_t * indptr_in,
-  const std::vector<int32_t> & indptr_size_in, const scalar_t * base_values_in,
-  scalar_t * reduced_values_out, int64_t * arg_indices_out, cudaStream_t stream_in);
-
-template <typename scalar_t, ReductionType REDUCE>
-int32_t segment_csr_launch(
-  const scalar_t * src_in, const std::vector<int32_t> & src_size_in, const int64_t * indptr_in,
-  const std::vector<int32_t> & indptr_size_in, scalar_t * reduced_values_out,
-  int64_t * arg_indices_out, cudaStream_t stream_in);
+  const scalar_t * src_in, int32_t num_rows_in, int32_t num_cols_in, const int64_t * indptr_in,
+  int32_t indptr_size_in, scalar_t * reduced_values_out, int64_t * arg_indices_out,
+  cudaStream_t stream_in);
 
 #endif  // AUTOWARE__SCATTER_OPS__SEGMENT_CSR_H_
diff --git a/perception/autoware_tensorrt_plugins/include/autoware/scatter_ops/utils.cuh b/perception/autoware_tensorrt_plugins/include/autoware/scatter_ops/utils.cuh
@@ -56,18 +56,4 @@ __global__ void div_kernel(
   src[thread_idx] /= divisor[thread_idx];
 }
 
-// cSpell:ignore indptr
-inline __host__ __device__ int indptr_to_offset(
-  const int64_t * indptr_size, int32_t indptr_dim, int32_t idx)
-{
-  int offset = idx % (indptr_size[indptr_dim - 1] - 1), stride = 1;
-  idx /= indptr_size[indptr_dim - 1] - 1;
-  for (int i = indptr_dim - 2; i >= 0; --i) {
-    stride *= indptr_size[i + 1];
-    offset += (idx % indptr_size[i]) * stride;
-    idx /= indptr_size[i];
-  }
-  return offset;
-}
-
 #endif  // AUTOWARE__SCATTER_OPS_UTILS_CUH
diff --git a/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu b/perception/autoware_tensorrt_plugins/src/scatter_ops/segment_csr.cu
@@ -24,22 +24,16 @@
 #include "autoware/scatter_ops/utils.cuh"
 
 #include <algorithm>
-#include <numeric>
 #include <string>
-#include <vector>
 
 #define THREADS 256
 #define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
 #define FULL_MASK 0xffffffff
-#define SEGMENT_CSR_LAUNCH_INSTANTIATION_TR(T, R)                                                  \
-  template int32_t segment_csr_launch<T, R>(                                                       \
-    const T * src_in, const std::vector<int32_t> & src_size_in, const int64_t * indptr_in,         \
-    const std::vector<int32_t> & indptr_size_in, T * reduced_values_out,                           \
-    int64_t * arg_indices_out, cudaStream_t stream_in);                                            \
-  template int32_t segment_csr_launch<T, R>(                                                       \
-    const T * src_in, const std::vector<int32_t> & src_size_in, const int64_t * indptr_in,         \
-    const std::vector<int32_t> & indptr_size_in, const T * base_values_in, T * reduced_values_out, \
-    int64_t * arg_indices_out, cudaStream_t stream_in);
+#define SEGMENT_CSR_LAUNCH_INSTANTIATION_TR(T, R)                                          \
+  template int32_t segment_csr_launch<T, R>(                                               \
+    const T * src_in, int32_t num_rows_in, int32_t num_cols_in, const int64_t * indptr_in, \
+    int32_t indptr_size_in, T * reduced_values_out, int64_t * arg_indices_out,             \
+    cudaStream_t stream_in);
 #define SEGMENT_CSR_LAUNCH_INSTANTIATION(T)                   \
   SEGMENT_CSR_LAUNCH_INSTANTIATION_TR(T, ReductionType::SUM)  \
   SEGMENT_CSR_LAUNCH_INSTANTIATION_TR(T, ReductionType::MEAN) \
@@ -48,42 +42,27 @@
   SEGMENT_CSR_LAUNCH_INSTANTIATION_TR(T, ReductionType::MIN)  \
   SEGMENT_CSR_LAUNCH_INSTANTIATION_TR(T, ReductionType::MAX)
 
-namespace
-{
-size_t get_output_numel(
-  const std::vector<int32_t> & src_size, const std::vector<int32_t> & indptr_size, size_t dim)
-{
-  size_t out_numel = static_cast<size_t>(std::max<int32_t>(indptr_size[dim] - 1, 0));
-  for (size_t i = 0; i < src_size.size(); ++i) {
-    if (i != dim) out_numel *= static_cast<size_t>(src_size[i]);
-  }
-  return out_numel;
-}
-}  // namespace
-
 template <typename scalar_t, ReductionType REDUCE, int TB>
 __global__ void segment_csr_kernel(
-  const scalar_t * src, const int64_t * indptr, const int64_t * indptr_size, int32_t indptr_dim,
-  scalar_t * out, int64_t * arg_out, size_t N, size_t E)
+  const scalar_t * src_in, const int64_t * indptr_in, scalar_t * reduced_values_out,
+  int64_t * arg_indices_out, size_t num_segments_in)
 {
   // Each warp processes exactly `32/TB` rows and aggregates all row values
   // via a parallel reduction.
 
   int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int row_idx = thread_idx / TB;
   int lane_idx = thread_idx & (TB - 1);
-  if (row_idx >= N) return;
+  if (row_idx >= num_segments_in) return;
 
-  int offset = indptr_to_offset(indptr_size, indptr_dim, row_idx);
-  int64_t row_start = __ldg(indptr + offset);
-  int64_t row_end = __ldg(indptr + offset + 1);
+  int64_t row_start = __ldg(indptr_in + row_idx);
+  int64_t row_end = __ldg(indptr_in + row_idx + 1);
 
   scalar_t val = Reducer<scalar_t, REDUCE>::init();
-  int64_t arg, arg_tmp;
+  int64_t arg{0}, arg_tmp{0};
 
-  offset = (row_idx / (indptr_size[indptr_dim - 1] - 1)) * E;
   for (int64_t src_idx = row_start + lane_idx; src_idx < row_end; src_idx += TB)
-    Reducer<scalar_t, REDUCE>::update(&val, src[offset + src_idx], &arg, src_idx);
+    Reducer<scalar_t, REDUCE>::update(&val, src_in[src_idx], &arg, src_idx);
 
 #pragma unroll
   for (int i = TB / 2; i > 0; i /= 2) {
@@ -94,124 +73,78 @@ __global__ void segment_csr_kernel(
   }
 
   if (lane_idx == 0)
-    if (arg_out != nullptr)
+    if (arg_indices_out != nullptr)
       Reducer<scalar_t, REDUCE>::write(
-        out + row_idx, val, arg_out + row_idx, arg, row_end - row_start);
+        reduced_values_out + row_idx, val, arg_indices_out + row_idx, arg, row_end - row_start);
     else
-      Reducer<scalar_t, REDUCE>::write(out + row_idx, val, row_end - row_start);
+      Reducer<scalar_t, REDUCE>::write(reduced_values_out + row_idx, val, row_end - row_start);
 }
 
 template <typename scalar_t, ReductionType REDUCE>
 __global__ void segment_csr_broadcast_kernel(
-  const scalar_t * src, const int64_t * indptr, const int64_t * indptr_size, int32_t indptr_dim,
-  scalar_t * out, int64_t * arg_out, size_t N, size_t K, size_t E)
+  const scalar_t * src_in, const int64_t * indptr_in, scalar_t * reduced_values_out,
+  int64_t * arg_indices_out, size_t num_segments_in, size_t num_cols_in)
 {
   // Each thread processes exactly one row. It turned out that is more
   // efficient than using shared memory due to avoiding synchronization
   // barriers.
 
   int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int row_idx = thread_idx / K;
-  int lane_idx = thread_idx % K;
-  if (thread_idx >= N * K) return;
+  int row_idx = thread_idx / num_cols_in;
+  int lane_idx = thread_idx % num_cols_in;
+  if (thread_idx >= num_segments_in * num_cols_in) return;
 
-  int offset = indptr_to_offset(indptr_size, indptr_dim, row_idx);
-  int64_t row_start = __ldg(indptr + offset);
-  int64_t row_end = __ldg(indptr + offset + 1);
+  int64_t row_start = __ldg(indptr_in + row_idx);
+  int64_t row_end = __ldg(indptr_in + row_idx + 1);
 
   scalar_t val = Reducer<scalar_t, REDUCE>::init();
-  int64_t arg;
+  int64_t arg{0};
 
-  offset = (row_idx / (indptr_size[indptr_dim - 1] - 1)) * E * K;
   for (int64_t src_idx = row_start; src_idx < row_end; src_idx++)
-    Reducer<scalar_t, REDUCE>::update(&val, src[offset + K * src_idx + lane_idx], &arg, src_idx);
+    Reducer<scalar_t, REDUCE>::update(
+      &val, src_in[num_cols_in * src_idx + lane_idx], &arg, src_idx);
 
-  if (arg_out != nullptr)
+  if (arg_indices_out != nullptr)
     Reducer<scalar_t, REDUCE>::write(
-      out + thread_idx, val, arg_out + thread_idx, arg, row_end - row_start);
+      reduced_values_out + thread_idx, val, arg_indices_out + thread_idx, arg, row_end - row_start);
   else
-    Reducer<scalar_t, REDUCE>::write(out + thread_idx, val, row_end - row_start);
+    Reducer<scalar_t, REDUCE>::write(reduced_values_out + thread_idx, val, row_end - row_start);
 }
 
 //! \todo test different devices (cudaSetDevice(src.get_device());)
 //! \todo expand index
 template <typename scalar_t, ReductionType REDUCE>
 int32_t segment_csr_launch(
-  const scalar_t * src_in, const std::vector<int32_t> & src_size_in, const int64_t * indptr_in,
-  const std::vector<int32_t> & indptr_size_in, const scalar_t * base_values_in,
-  scalar_t * reduced_values_out, int64_t * arg_indices_out, cudaStream_t stream_in)
+  const scalar_t * src_in, int32_t num_rows_in, int32_t num_cols_in, const int64_t * indptr_in,
+  int32_t indptr_size_in, scalar_t * reduced_values_out, int64_t * arg_indices_out,
+  cudaStream_t stream_in)
 {
-  if (indptr_size_in.empty() || src_size_in.size() < indptr_size_in.size()) return -1;
-
-  if (!std::equal(indptr_size_in.begin(), indptr_size_in.end() - 1, src_size_in.begin())) return -1;
+  if (num_rows_in < 0 || num_cols_in < 0 || indptr_size_in < 0) return -1;
 
-  auto dim = indptr_size_in.size() - 1;
-
-  auto _mul = [](int a, int b) { return a * b; };
-  auto src_numel = std::accumulate(src_size_in.begin(), src_size_in.end(), 1, _mul);
-  auto indptr_numel = std::accumulate(indptr_size_in.begin(), indptr_size_in.end(), 1, _mul);
-  auto out_numel = get_output_numel(src_size_in, indptr_size_in, dim);
+  auto num_segments = std::max<int32_t>(indptr_size_in - 1, 0);
+  auto out_numel = static_cast<size_t>(num_segments) * static_cast<size_t>(num_cols_in);
 
   if (out_numel == 0) return 0;
 
-  cudaMemcpyAsync(
-    reduced_values_out, base_values_in, sizeof(scalar_t) * out_numel, cudaMemcpyDeviceToDevice,
-    stream_in);
-
   if ((REDUCE == ReductionType::MIN || REDUCE == ReductionType::MAX) && arg_indices_out != nullptr)
-    fill_kernel<int64_t><<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(
-      arg_indices_out, out_numel, src_size_in[dim]);
-
-  if (src_numel == 0) return 0;
-
-  auto N = max(indptr_size_in[dim] - 1, 0) * (indptr_numel / indptr_size_in[dim]);
-  auto K = out_numel / N;
-  auto E = src_size_in[dim];
-  int64_t * indptr_size_dev;
-  cudaMallocAsync(&indptr_size_dev, sizeof(int64_t) * indptr_size_in.size(), stream_in);
-  cudaMemcpyAsync(
-    indptr_size_dev, indptr_size_in.data(), sizeof(int64_t) * indptr_size_in.size(),
-    cudaMemcpyHostToDevice, stream_in);
-
-  if (K == 1)
-    segment_csr_kernel<scalar_t, REDUCE, 1><<<BLOCKS(32, N), THREADS, 0, stream_in>>>(
-      src_in, indptr_in, indptr_size_dev, indptr_size_in.size(), reduced_values_out,
-      arg_indices_out, N, E);
-  else
-    segment_csr_broadcast_kernel<scalar_t, REDUCE><<<BLOCKS(1, N * K), THREADS, 0, stream_in>>>(
-      src_in, indptr_in, indptr_size_dev, indptr_size_in.size(), reduced_values_out,
-      arg_indices_out, N, K, E);
+    fill_kernel<int64_t>
+      <<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(arg_indices_out, out_numel, num_rows_in);
 
-  cudaFreeAsync(indptr_size_dev, stream_in);
-  return 0;
-}
-
-template <typename scalar_t, ReductionType REDUCE>
-int32_t segment_csr_launch(
-  const scalar_t * src_in, const std::vector<int32_t> & src_size_in, const int64_t * indptr_in,
-  const std::vector<int32_t> & indptr_size_in, scalar_t * reduced_values_out,
-  int64_t * arg_indices_out, cudaStream_t stream_in)
-{
-  if (indptr_size_in.empty() || src_size_in.size() < indptr_size_in.size()) return -1;
-
-  if (!std::equal(indptr_size_in.begin(), indptr_size_in.end() - 1, src_size_in.begin())) return -1;
-
-  auto dim = indptr_size_in.size() - 1;
-  auto out_numel = get_output_numel(src_size_in, indptr_size_in, dim);
-  if (out_numel == 0) return 0;
-
-  scalar_t * base_values;
+  scalar_t * base_values{nullptr};
   cudaMallocAsync(&base_values, sizeof(scalar_t) * out_numel, stream_in);
   fill_kernel<scalar_t><<<BLOCKS(1, out_numel), THREADS, 0, stream_in>>>(
     base_values, out_numel, static_cast<scalar_t>(0));
+  cudaMemcpyAsync(
+    reduced_values_out, base_values, sizeof(scalar_t) * out_numel, cudaMemcpyDeviceToDevice,
+    stream_in);
 
-  auto status = segment_csr_launch<scalar_t, REDUCE>(
-    src_in, src_size_in, indptr_in, indptr_size_in, base_values, reduced_values_out,
-    arg_indices_out, stream_in);
-  if (status != 0) {
-    cudaFreeAsync(base_values, stream_in);
-    return status;
-  }
+  if (num_cols_in == 1)
+    segment_csr_kernel<scalar_t, REDUCE, 1><<<BLOCKS(32, num_segments), THREADS, 0, stream_in>>>(
+      src_in, indptr_in, reduced_values_out, arg_indices_out, num_segments);
+  else
+    segment_csr_broadcast_kernel<scalar_t, REDUCE>
+      <<<BLOCKS(1, num_segments * num_cols_in), THREADS, 0, stream_in>>>(
+        src_in, indptr_in, reduced_values_out, arg_indices_out, num_segments, num_cols_in);
 
   cudaFreeAsync(base_values, stream_in);
   return 0;
diff --git a/perception/autoware_tensorrt_plugins/src/segment_csr_plugin.cpp b/perception/autoware_tensorrt_plugins/src/segment_csr_plugin.cpp
@@ -26,7 +26,6 @@
 #include <cstdint>
 #include <exception>
 #include <string>
-#include <vector>
 namespace nvinfer1::plugin
 {
 
@@ -177,9 +176,9 @@ std::int32_t SegmentCSRPlugin::enqueue(
   void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace,
   cudaStream_t stream) noexcept
 {
-  std::vector<int32_t> src_size{
-    static_cast<int32_t>(input_desc[0].dims.d[0]), static_cast<int32_t>(input_desc[0].dims.d[1])};
-  std::vector<int32_t> indptr_size{static_cast<int32_t>(input_desc[1].dims.d[0])};
+  const auto num_rows = static_cast<int32_t>(input_desc[0].dims.d[0]);
+  const auto num_cols = static_cast<int32_t>(input_desc[0].dims.d[1]);
+  const auto indptr_size = static_cast<int32_t>(input_desc[1].dims.d[0]);
 
   std::int32_t result = 0;
 
@@ -190,7 +189,7 @@ std::int32_t SegmentCSRPlugin::enqueue(
 
     AT_DISPATCH_REDUCTION_TYPES(reduce_, [&] {
       result = segment_csr_launch<float, REDUCE>(
-        src_ptr, src_size, indptr_ptr, indptr_size, out_ptr, nullptr, stream);
+        src_ptr, num_rows, num_cols, indptr_ptr, indptr_size, out_ptr, nullptr, stream);
     });
   } else if (input_desc[0].type == nvinfer1::DataType::kHALF) {
     const half * src_ptr = reinterpret_cast<const half *>(inputs[0]);
@@ -199,7 +198,7 @@ std::int32_t SegmentCSRPlugin::enqueue(
 
     AT_DISPATCH_REDUCTION_TYPES(reduce_, [&] {
       result = segment_csr_launch<half, REDUCE>(
-        src_ptr, src_size, indptr_ptr, indptr_size, out_ptr, nullptr, stream);
+        src_ptr, num_rows, num_cols, indptr_ptr, indptr_size, out_ptr, nullptr, stream);
     });
   }
 
diff --git a/perception/autoware_tensorrt_plugins/test/segment_csr_kernel_test.cpp b/perception/autoware_tensorrt_plugins/test/segment_csr_kernel_test.cpp