22 changes: 22 additions & 0 deletions perception/autoware_tensorrt_plugins/CMakeLists.txt
@@ -147,6 +147,28 @@ if(TRT_AVAIL AND CUDA_AVAIL AND SPCONV_AVAIL)
spconv::spconv
)

if(BUILD_TESTING)
find_package(ament_cmake_gtest REQUIRED)

ament_add_gtest(reference_kernels_test
test/argsort_kernel_test.cpp
test/unique_kernel_test.cpp
)
if(TARGET reference_kernels_test)
target_link_libraries(reference_kernels_test
CUDA::cudart
cuda_ops
)
target_include_directories(reference_kernels_test PRIVATE
include
test
${autoware_cuda_utils_INCLUDE_DIRS}
${CUDA_INCLUDE_DIRS}
)
target_compile_definitions(reference_kernels_test PRIVATE _GLIBCXX_USE_CXX11_ABI=1)
endif()
endif()

install(
TARGETS ${PROJECT_NAME}
DESTINATION share/${PROJECT_NAME}/plugins
1 change: 1 addition & 0 deletions perception/autoware_tensorrt_plugins/package.xml
@@ -20,6 +20,7 @@
<depend>autoware_cuda_dependency_meta</depend>
<depend>autoware_cuda_utils</depend>

<test_depend>ament_cmake_gtest</test_depend>
<test_depend>ament_cmake_ros</test_depend>
<test_depend>ament_lint_auto</test_depend>
<test_depend>autoware_lint_common</test_depend>
@@ -157,8 +157,7 @@ std::int64_t unique(
sorted_input;

  cudaMemcpyAsync(
-    range_ptr + num_out * sizeof(int64_t), &num_input_elements, sizeof(std::int64_t),
-    cudaMemcpyHostToDevice, stream);
+    range_ptr + num_out, &num_input_elements, sizeof(std::int64_t), cudaMemcpyHostToDevice, stream);

thrust::adjacent_difference(policy, range_ptr + 1, range_ptr + num_out + 1, unique_counts);
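
For context on the fix above: range_ptr is a typed std::int64_t pointer, so pointer arithmetic already advances in whole elements, and the removed num_out * sizeof(int64_t) offset overshot the intended slot by a factor of sizeof(std::int64_t). A minimal sketch of the distinction, with hypothetical names not taken from the PR:

#include <cstddef>
#include <cstdint>

// Hypothetical illustration of the pointer-arithmetic pitfall fixed above.
void write_sentinel(std::int64_t * range_ptr, std::size_t num_out, std::int64_t value)
{
  // Correct: range_ptr + num_out advances by num_out elements,
  // i.e. num_out * sizeof(std::int64_t) bytes.
  range_ptr[num_out] = value;

  // Incorrect (the old pattern): advances by num_out * sizeof(std::int64_t) elements,
  // eight times farther than intended for a 64-bit element type.
  // *(range_ptr + num_out * sizeof(std::int64_t)) = value;
}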

82 changes: 82 additions & 0 deletions perception/autoware_tensorrt_plugins/test/argsort_kernel_test.cpp
@@ -0,0 +1,82 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "autoware/argsort_ops/argsort.hpp"
#include "test_utils.hpp"

#include <autoware/cuda_utils/cuda_gtest_utils.hpp>

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace
{

using autoware::tensorrt_plugins::test::copy_to_device;
using autoware::tensorrt_plugins::test::copy_to_host;
using autoware::tensorrt_plugins::test::CudaStreamGuard;
using autoware::tensorrt_plugins::test::DeviceBuffer;

std::size_t get_argsort_total_workspace_size(const std::size_t num_elements)
{
const auto temp_size = get_argsort_workspace_size(num_elements);
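// Round the temporary-storage size up to std::int64_t alignment, then append
// room for 2 * num_elements std::int64_t scratch values behind it.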
const auto scratch_offset =
((temp_size + alignof(std::int64_t) - 1U) / alignof(std::int64_t)) * alignof(std::int64_t);
return scratch_offset + sizeof(std::int64_t) * 2U * num_elements;
}

std::vector<std::int64_t> make_argsort_reference(const std::vector<std::int64_t> & input)
{
std::vector<std::int64_t> indices(input.size());
for (std::size_t index = 0; index < input.size(); ++index) {
indices[index] = static_cast<std::int64_t>(index);
}

std::stable_sort(
indices.begin(), indices.end(),
[&input](const std::int64_t lhs, const std::int64_t rhs) { return input[lhs] < input[rhs]; });

return indices;
}

TEST(ReferenceKernelsTest, ArgsortMatchesCpuReference)
{
SKIP_TEST_IF_CUDA_UNAVAILABLE();

const std::vector<std::int64_t> input{7, 3, 7, 5, 3, 3, 9, 5, 11, 7};
const auto reference = make_argsort_reference(input);

CudaStreamGuard stream;
DeviceBuffer<std::int64_t> input_d(input.size());
DeviceBuffer<std::int64_t> output_d(input.size());
DeviceBuffer<std::uint8_t> workspace_d(get_argsort_total_workspace_size(input.size()));

copy_to_device(input_d.get(), input);

ASSERT_EQ(
argsort(
input_d.get(), output_d.get(), workspace_d.get(), input.size(),
get_argsort_workspace_size(input.size()), stream.get()),
cudaSuccess);
Comment on lines +70 to +76
Contributor

Suggested change
-copy_to_device(input_d.get(), input);
-ASSERT_EQ(
-  argsort(
-    input_d.get(), output_d.get(), workspace_d.get(), input.size(),
-    get_argsort_workspace_size(input.size()), stream.get()),
-  cudaSuccess);
+cudaEvent_t copy_event;
+cudaEventCreate(&copy_event);
+copy_to_device(input_d.get(), input);
+cudaEventRecord(copy_event, 0);  // record event on the default stream
+cudaStreamWaitEvent(stream.get(), copy_event, cudaEventWaitDefault);
+ASSERT_EQ(
+  argsort(
+    input_d.get(), output_d.get(), workspace_d.get(), input.size(),
+    get_argsort_workspace_size(input.size()), stream.get()),
+  cudaSuccess);

According to the documented memcpy synchronization behavior:

For transfers from pageable host memory to device memory, a stream sync is performed before the copy is initiated. The function will return once the pageable buffer has been copied to the staging memory for DMA transfer to device memory, but the DMA to final destination may not have completed.

Since input is pageable host memory, it would be better to insert explicit synchronization before operating on input_d.

Alternatively, we can skip this kind of explicit synchronization if copy_to_device takes a CUDA stream as an argument and performs cudaMemcpyAsync inside.
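
A minimal sketch of that alternative, assuming a hypothetical copy_to_device_async helper next to the existing ones in test_utils.hpp (name and signature are illustrative, not part of this PR):

#include <cuda_runtime_api.h>

#include <stdexcept>
#include <vector>

// Hypothetical stream-aware variant of copy_to_device (not part of this PR).
// Enqueueing the copy on the same stream as the kernel means stream ordering
// guarantees the data is on the device before argsort runs on that stream.
template <typename T>
void copy_to_device_async(T * device_ptr, const std::vector<T> & host_values, cudaStream_t stream)
{
  const cudaError_t status = cudaMemcpyAsync(
    device_ptr, host_values.data(), sizeof(T) * host_values.size(), cudaMemcpyHostToDevice,
    stream);
  if (status != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(status));
  }
}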

Contributor Author

a stream sync is performed before the copy is initiated
That means that cudaMemcpy already syncs afaik, so we are already fine, right? Please correct me if I'm wrong.

ASSERT_EQ(cudaStreamSynchronize(stream.get()), cudaSuccess);

EXPECT_EQ(copy_to_host(output_d.get(), input.size()), reference);
}

} // namespace
109 changes: 109 additions & 0 deletions perception/autoware_tensorrt_plugins/test/test_utils.hpp
@@ -0,0 +1,109 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>

#include <cstddef>
#include <stdexcept>
#include <vector>

namespace autoware::tensorrt_plugins::test
{

// RAII guard that creates a CUDA stream on construction and destroys it on destruction.
class CudaStreamGuard
{
public:
CudaStreamGuard()
{
const cudaError_t status = cudaStreamCreate(&stream_);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
}

~CudaStreamGuard()
{
if (stream_ != nullptr) {
cudaStreamDestroy(stream_);
}
}

CudaStreamGuard(const CudaStreamGuard &) = delete;
CudaStreamGuard & operator=(const CudaStreamGuard &) = delete;
CudaStreamGuard(CudaStreamGuard &&) = delete;
CudaStreamGuard & operator=(CudaStreamGuard &&) = delete;

[[nodiscard]] cudaStream_t get() const { return stream_; }

private:
cudaStream_t stream_{nullptr};
};

// RAII owner of a device allocation holding element_count values of type T.
template <typename T>
class DeviceBuffer
{
public:
explicit DeviceBuffer(std::size_t element_count) : element_count_(element_count)
{
const cudaError_t status = cudaMalloc(&data_, sizeof(T) * element_count_);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
}

~DeviceBuffer()
{
if (data_ != nullptr) {
cudaFree(data_);
}
}

DeviceBuffer(const DeviceBuffer &) = delete;
DeviceBuffer & operator=(const DeviceBuffer &) = delete;
DeviceBuffer(DeviceBuffer &&) = delete;
DeviceBuffer & operator=(DeviceBuffer &&) = delete;

[[nodiscard]] T * get() const { return static_cast<T *>(data_); }

private:
void * data_{nullptr};
std::size_t element_count_{0U};
};

template <typename T>
void copy_to_device(T * device_ptr, const std::vector<T> & host_values)
{
const cudaError_t status = cudaMemcpy(
device_ptr, host_values.data(), sizeof(T) * host_values.size(), cudaMemcpyHostToDevice);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
}

template <typename T>
std::vector<T> copy_to_host(const T * device_ptr, const std::size_t element_count)
{
std::vector<T> host_values(element_count);
const cudaError_t status =
cudaMemcpy(host_values.data(), device_ptr, sizeof(T) * element_count, cudaMemcpyDeviceToHost);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
return host_values;
}

} // namespace autoware::tensorrt_plugins::test
99 changes: 99 additions & 0 deletions perception/autoware_tensorrt_plugins/test/unique_kernel_test.cpp
@@ -0,0 +1,99 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "autoware/unique_ops/unique.hpp"
#include "test_utils.hpp"

#include <autoware/cuda_utils/cuda_gtest_utils.hpp>

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

namespace
{

using autoware::tensorrt_plugins::test::copy_to_device;
using autoware::tensorrt_plugins::test::copy_to_host;
using autoware::tensorrt_plugins::test::CudaStreamGuard;
using autoware::tensorrt_plugins::test::DeviceBuffer;

struct UniqueReference
{
std::vector<std::int64_t> unique_values;
std::vector<std::int64_t> inverse_indices;
std::vector<std::int64_t> counts;
};

UniqueReference make_unique_reference(const std::vector<std::int64_t> & input)
{
UniqueReference reference;
reference.unique_values = input;
std::sort(reference.unique_values.begin(), reference.unique_values.end());
reference.unique_values.erase(
std::unique(reference.unique_values.begin(), reference.unique_values.end()),
reference.unique_values.end());

std::map<std::int64_t, std::int64_t> value_to_index;
for (std::size_t index = 0; index < reference.unique_values.size(); ++index) {
value_to_index.emplace(reference.unique_values[index], static_cast<std::int64_t>(index));
}
reference.counts.resize(reference.unique_values.size(), 0);

reference.inverse_indices.reserve(input.size());
for (const auto value : input) {
const auto index = value_to_index.at(value);
reference.inverse_indices.push_back(index);
reference.counts[static_cast<std::size_t>(index)] += 1;
}

return reference;
}

TEST(ReferenceKernelsTest, UniqueMatchesCpuReference)
{
SKIP_TEST_IF_CUDA_UNAVAILABLE();

const std::vector<std::int64_t> input{7, 3, 7, 5, 3, 3, 9, 5, 11, 7};
const UniqueReference reference = make_unique_reference(input);

CudaStreamGuard stream;
DeviceBuffer<std::int64_t> input_d(input.size());
DeviceBuffer<std::int64_t> unique_d(input.size());
DeviceBuffer<std::int64_t> inverse_d(input.size());
DeviceBuffer<std::int64_t> counts_d(input.size());
DeviceBuffer<std::uint8_t> workspace_d(get_unique_workspace_size(input.size()));

copy_to_device(input_d.get(), input);

Contributor

Same as in the case of argsort_kernel_test.cpp; better to insert an explicit sync.

Contributor Author

See above reply.

const auto num_unique = unique(
input_d.get(), unique_d.get(), inverse_d.get(), counts_d.get(), workspace_d.get(), input.size(),
get_unique_workspace_size(input.size()), stream.get());
ASSERT_EQ(cudaStreamSynchronize(stream.get()), cudaSuccess);

const auto unique_values = copy_to_host(unique_d.get(), static_cast<std::size_t>(num_unique));
const auto inverse_indices = copy_to_host(inverse_d.get(), input.size());
const auto counts = copy_to_host(counts_d.get(), static_cast<std::size_t>(num_unique));

EXPECT_EQ(unique_values, reference.unique_values);
EXPECT_EQ(inverse_indices, reference.inverse_indices);
EXPECT_EQ(counts, reference.counts);
}

} // namespace