22 changes: 22 additions & 0 deletions perception/autoware_tensorrt_plugins/CMakeLists.txt
@@ -147,6 +147,28 @@ if(TRT_AVAIL AND CUDA_AVAIL AND SPCONV_AVAIL)
spconv::spconv
)

if(BUILD_TESTING)
find_package(ament_cmake_gtest REQUIRED)

ament_add_gtest(reference_kernels_test
test/argsort_kernel_test.cpp
test/unique_kernel_test.cpp
)
if(TARGET reference_kernels_test)
target_link_libraries(reference_kernels_test
CUDA::cudart
cuda_ops
)
target_include_directories(reference_kernels_test PRIVATE
include
test
${autoware_cuda_utils_INCLUDE_DIRS}
${CUDA_INCLUDE_DIRS}
)
target_compile_definitions(reference_kernels_test PRIVATE _GLIBCXX_USE_CXX11_ABI=1)
endif()
endif()

install(
TARGETS ${PROJECT_NAME}
DESTINATION share/${PROJECT_NAME}/plugins
1 change: 1 addition & 0 deletions perception/autoware_tensorrt_plugins/package.xml
@@ -20,6 +20,7 @@
<depend>autoware_cuda_dependency_meta</depend>
<depend>autoware_cuda_utils</depend>

<test_depend>ament_cmake_gtest</test_depend>
<test_depend>ament_cmake_ros</test_depend>
<test_depend>ament_lint_auto</test_depend>
<test_depend>autoware_lint_common</test_depend>
@@ -157,8 +157,7 @@ std::int64_t unique(
sorted_input;

  cudaMemcpyAsync(
-    range_ptr + num_out * sizeof(int64_t), &num_input_elements, sizeof(std::int64_t),
-    cudaMemcpyHostToDevice, stream);
+    range_ptr + num_out, &num_input_elements, sizeof(std::int64_t), cudaMemcpyHostToDevice, stream);

thrust::adjacent_difference(policy, range_ptr + 1, range_ptr + num_out + 1, unique_counts);
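
For context on the fix above: range_ptr is a typed std::int64_t pointer, so pointer arithmetic already advances in whole elements, and the removed num_out * sizeof(int64_t) offset overshot the intended slot by a factor of sizeof(std::int64_t). A minimal sketch of the distinction, with hypothetical names not taken from the PR:

#include <cstddef>
#include <cstdint>

// Hypothetical illustration of the pointer-arithmetic pitfall fixed above.
void write_sentinel(std::int64_t * range_ptr, std::size_t num_out, std::int64_t value)
{
  // Correct: range_ptr + num_out advances by num_out elements,
  // i.e. num_out * sizeof(std::int64_t) bytes.
  range_ptr[num_out] = value;

  // Incorrect (the old pattern): advances by num_out * sizeof(std::int64_t) elements,
  // eight times farther than intended for a 64-bit element type.
  // *(range_ptr + num_out * sizeof(std::int64_t)) = value;
}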

82 changes: 82 additions & 0 deletions perception/autoware_tensorrt_plugins/test/argsort_kernel_test.cpp
@@ -0,0 +1,82 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "autoware/argsort_ops/argsort.hpp"
#include "test_utils.hpp"

#include <autoware/cuda_utils/cuda_gtest_utils.hpp>

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace
{

using autoware::tensorrt_plugins::test::copy_to_device;
using autoware::tensorrt_plugins::test::copy_to_host;
using autoware::tensorrt_plugins::test::CudaStreamGuard;
using autoware::tensorrt_plugins::test::DeviceBuffer;

std::size_t get_argsort_total_workspace_size(const std::size_t num_elements)
{
const auto temp_size = get_argsort_workspace_size(num_elements);
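// Round the temporary-storage size up to std::int64_t alignment, then append
// room for 2 * num_elements std::int64_t scratch values behind it.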
const auto scratch_offset =
((temp_size + alignof(std::int64_t) - 1U) / alignof(std::int64_t)) * alignof(std::int64_t);
return scratch_offset + sizeof(std::int64_t) * 2U * num_elements;
}

std::vector<std::int64_t> make_argsort_reference(const std::vector<std::int64_t> & input)
{
std::vector<std::int64_t> indices(input.size());
for (std::size_t index = 0; index < input.size(); ++index) {
indices[index] = static_cast<std::int64_t>(index);
}

std::stable_sort(
indices.begin(), indices.end(),
[&input](const std::int64_t lhs, const std::int64_t rhs) { return input[lhs] < input[rhs]; });

return indices;
}

TEST(ReferenceKernelsTest, ArgsortMatchesCpuReference)
{
SKIP_TEST_IF_CUDA_UNAVAILABLE();

const std::vector<std::int64_t> input{7, 3, 7, 5, 3, 3, 9, 5, 11, 7};
const auto reference = make_argsort_reference(input);

CudaStreamGuard stream;
DeviceBuffer<std::int64_t> input_d(input.size());
DeviceBuffer<std::int64_t> output_d(input.size());
DeviceBuffer<std::uint8_t> workspace_d(get_argsort_total_workspace_size(input.size()));

copy_to_device(input_d.get(), input);

ASSERT_EQ(
argsort(
input_d.get(), output_d.get(), workspace_d.get(), input.size(),
get_argsort_workspace_size(input.size()), stream.get()),
cudaSuccess);
Comment on lines +70 to +76
Contributor

Suggested change
-copy_to_device(input_d.get(), input);
-ASSERT_EQ(
-  argsort(
-    input_d.get(), output_d.get(), workspace_d.get(), input.size(),
-    get_argsort_workspace_size(input.size()), stream.get()),
-  cudaSuccess);
+cudaEvent_t copy_event;
+cudaEventCreate(&copy_event);
+copy_to_device(input_d.get(), input);
+cudaEventRecord(copy_event, 0);  // record event on the default stream
+cudaStreamWaitEvent(stream.get(), copy_event, cudaEventWaitDefault);
+ASSERT_EQ(
+  argsort(
+    input_d.get(), output_d.get(), workspace_d.get(), input.size(),
+    get_argsort_workspace_size(input.size()), stream.get()),
+  cudaSuccess);

According to the documented memcpy synchronization behavior:

For transfers from pageable host memory to device memory, a stream sync is performed before the copy is initiated. The function will return once the pageable buffer has been copied to the staging memory for DMA transfer to device memory, but the DMA to final destination may not have completed.

Since input is pageable host memory, it would be better to insert explicit synchronization before operating on input_d.

Alternatively, we can skip this kind of explicit synchronization if copy_to_device takes a CUDA stream as an argument and performs cudaMemcpyAsync inside.
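
A minimal sketch of that alternative, assuming a hypothetical copy_to_device_async helper next to the existing ones in test_utils.hpp (name and signature are illustrative, not part of this PR):

#include <cuda_runtime_api.h>

#include <stdexcept>
#include <vector>

// Hypothetical stream-aware variant of copy_to_device (not part of this PR).
// Enqueueing the copy on the same stream as the kernel means stream ordering
// guarantees the data is on the device before argsort runs on that stream.
template <typename T>
void copy_to_device_async(T * device_ptr, const std::vector<T> & host_values, cudaStream_t stream)
{
  const cudaError_t status = cudaMemcpyAsync(
    device_ptr, host_values.data(), sizeof(T) * host_values.size(), cudaMemcpyHostToDevice,
    stream);
  if (status != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(status));
  }
}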

Contributor Author

a stream sync is performed before the copy is initiated
That means that cudaMemcpy already syncs afaik, so we are already fine, right? Please correct me if I'm wrong.

ASSERT_EQ(cudaStreamSynchronize(stream.get()), cudaSuccess);

EXPECT_EQ(copy_to_host(output_d.get(), input.size()), reference);
}

} // namespace
109 changes: 109 additions & 0 deletions perception/autoware_tensorrt_plugins/test/test_utils.hpp
@@ -0,0 +1,109 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>

#include <cstddef>
#include <stdexcept>
#include <vector>

namespace autoware::tensorrt_plugins::test
{

// RAII guard that creates a CUDA stream on construction and destroys it on destruction.
class CudaStreamGuard
{
public:
CudaStreamGuard()
{
const cudaError_t status = cudaStreamCreate(&stream_);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
}

~CudaStreamGuard()
{
if (stream_ != nullptr) {
cudaStreamDestroy(stream_);
}
}

CudaStreamGuard(const CudaStreamGuard &) = delete;
CudaStreamGuard & operator=(const CudaStreamGuard &) = delete;
CudaStreamGuard(CudaStreamGuard &&) = delete;
CudaStreamGuard & operator=(CudaStreamGuard &&) = delete;

[[nodiscard]] cudaStream_t get() const { return stream_; }

private:
cudaStream_t stream_{nullptr};
};

// RAII owner of a device allocation holding element_count values of type T.
template <typename T>
class DeviceBuffer
{
public:
explicit DeviceBuffer(std::size_t element_count) : element_count_(element_count)
{
const cudaError_t status = cudaMalloc(&data_, sizeof(T) * element_count_);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
}

~DeviceBuffer()
{
if (data_ != nullptr) {
cudaFree(data_);
}
}

DeviceBuffer(const DeviceBuffer &) = delete;
DeviceBuffer & operator=(const DeviceBuffer &) = delete;
DeviceBuffer(DeviceBuffer &&) = delete;
DeviceBuffer & operator=(DeviceBuffer &&) = delete;

[[nodiscard]] T * get() const { return static_cast<T *>(data_); }

private:
void * data_{nullptr};
std::size_t element_count_{0U};
};

template <typename T>
void copy_to_device(T * device_ptr, const std::vector<T> & host_values)
{
const cudaError_t status = cudaMemcpy(
device_ptr, host_values.data(), sizeof(T) * host_values.size(), cudaMemcpyHostToDevice);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
}

template <typename T>
std::vector<T> copy_to_host(const T * device_ptr, const std::size_t element_count)
{
std::vector<T> host_values(element_count);
const cudaError_t status =
cudaMemcpy(host_values.data(), device_ptr, sizeof(T) * element_count, cudaMemcpyDeviceToHost);
if (status != cudaSuccess) {
throw std::runtime_error(cudaGetErrorString(status));
}
return host_values;
}

} // namespace autoware::tensorrt_plugins::test
99 changes: 99 additions & 0 deletions perception/autoware_tensorrt_plugins/test/unique_kernel_test.cpp
@@ -0,0 +1,99 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "autoware/unique_ops/unique.hpp"
#include "test_utils.hpp"

#include <autoware/cuda_utils/cuda_gtest_utils.hpp>

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

namespace
{

using autoware::tensorrt_plugins::test::copy_to_device;
using autoware::tensorrt_plugins::test::copy_to_host;
using autoware::tensorrt_plugins::test::CudaStreamGuard;
using autoware::tensorrt_plugins::test::DeviceBuffer;

struct UniqueReference
{
std::vector<std::int64_t> unique_values;
std::vector<std::int64_t> inverse_indices;
std::vector<std::int64_t> counts;
};

UniqueReference make_unique_reference(const std::vector<std::int64_t> & input)
{
UniqueReference reference;
reference.unique_values = input;
std::sort(reference.unique_values.begin(), reference.unique_values.end());
reference.unique_values.erase(
std::unique(reference.unique_values.begin(), reference.unique_values.end()),
reference.unique_values.end());

std::map<std::int64_t, std::int64_t> value_to_index;
for (std::size_t index = 0; index < reference.unique_values.size(); ++index) {
value_to_index.emplace(reference.unique_values[index], static_cast<std::int64_t>(index));
}
reference.counts.resize(reference.unique_values.size(), 0);

reference.inverse_indices.reserve(input.size());
for (const auto value : input) {
const auto index = value_to_index.at(value);
reference.inverse_indices.push_back(index);
reference.counts[static_cast<std::size_t>(index)] += 1;
}

return reference;
}

TEST(ReferenceKernelsTest, UniqueMatchesCpuReference)
{
SKIP_TEST_IF_CUDA_UNAVAILABLE();

const std::vector<std::int64_t> input{7, 3, 7, 5, 3, 3, 9, 5, 11, 7};
const UniqueReference reference = make_unique_reference(input);

CudaStreamGuard stream;
DeviceBuffer<std::int64_t> input_d(input.size());
DeviceBuffer<std::int64_t> unique_d(input.size());
DeviceBuffer<std::int64_t> inverse_d(input.size());
DeviceBuffer<std::int64_t> counts_d(input.size());
DeviceBuffer<std::uint8_t> workspace_d(get_unique_workspace_size(input.size()));

copy_to_device(input_d.get(), input);

Contributor

Same as in the case of argsort_kernel_test.cpp; better to insert an explicit sync.

Contributor Author

See above reply.

const auto num_unique = unique(
input_d.get(), unique_d.get(), inverse_d.get(), counts_d.get(), workspace_d.get(), input.size(),
get_unique_workspace_size(input.size()), stream.get());
ASSERT_EQ(cudaStreamSynchronize(stream.get()), cudaSuccess);

const auto unique_values = copy_to_host(unique_d.get(), static_cast<std::size_t>(num_unique));
const auto inverse_indices = copy_to_host(inverse_d.get(), input.size());
const auto counts = copy_to_host(counts_d.get(), static_cast<std::size_t>(num_unique));

EXPECT_EQ(unique_values, reference.unique_values);
EXPECT_EQ(inverse_indices, reference.inverse_indices);
EXPECT_EQ(counts, reference.counts);
}

} // namespace