Skip to content

Commit e20caa4

Browse files
committed
perf(autoware_tensorrt_plugins): keep plugin outputs stream-ordered
1 parent 971643a commit e20caa4

8 files changed

Lines changed: 82 additions & 41 deletions

File tree

perception/autoware_tensorrt_plugins/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class GetIndicesPairsImplicitGemmPlugin : public IPluginV3,
6363
GetIndicesPairsImplicitGemmPlugin(
6464
const std::string & name, GetIndicesPairsImplicitGemmParameters const & params);
6565

66-
~GetIndicesPairsImplicitGemmPlugin() override = default;
66+
~GetIndicesPairsImplicitGemmPlugin() override;
6767

6868
// IPluginV3 Methods
6969

@@ -131,6 +131,7 @@ class GetIndicesPairsImplicitGemmPlugin : public IPluginV3,
131131

132132
std::string layer_name_;
133133
GetIndicesPairsImplicitGemmParameters params_;
134+
std::int32_t * num_act_out_host_{nullptr};
134135
std::vector<nvinfer1::PluginField> data_to_serialize_;
135136
nvinfer1::PluginFieldCollection fc_to_serialize_;
136137
};

perception/autoware_tensorrt_plugins/include/autoware/tensorrt_plugins/get_indices_pairs_plugin.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class GetIndicesPairsPlugin : public IPluginV3,
6161
public:
6262
GetIndicesPairsPlugin(const std::string & name, GetIndicesPairsParameters const & params);
6363

64-
~GetIndicesPairsPlugin() override = default;
64+
~GetIndicesPairsPlugin() override;
6565

6666
// IPluginV3 Methods
6767

@@ -125,6 +125,7 @@ class GetIndicesPairsPlugin : public IPluginV3,
125125

126126
std::string layer_name_;
127127
GetIndicesPairsParameters params_;
128+
std::int32_t * num_act_out_host_{nullptr};
128129
std::vector<nvinfer1::PluginField> data_to_serialize_;
129130
nvinfer1::PluginFieldCollection fc_to_serialize_;
130131
};

perception/autoware_tensorrt_plugins/include/autoware/unique_ops/unique.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@
1919

2020
#include <cstdint>
2121

22-
std::int64_t unique(
22+
cudaError_t unique(
2323
const std::int64_t * input, std::int64_t * unique, std::int64_t * inverse_indices,
24-
std::int64_t * unique_counts, void * workspace, std::size_t num_input_elements,
25-
std::size_t workspace_size, cudaStream_t stream);
24+
std::int64_t * unique_counts, std::int64_t * num_unique, void * workspace,
25+
std::size_t num_input_elements, std::size_t workspace_size, cudaStream_t stream);
2626

2727
std::size_t get_unique_temp_storage_size(std::size_t num_elements);
2828
std::size_t get_unique_workspace_size(std::size_t num_elements);

perception/autoware_tensorrt_plugins/src/get_indices_pairs_implicit_gemm_plugin.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,16 @@ GetIndicesPairsImplicitGemmPlugin::GetIndicesPairsImplicitGemmPlugin(
4343
: layer_name_{name}, params_{params}
4444
{
4545
initFieldsToSerialize();
46+
PLUGIN_ASSERT(
47+
cudaMallocHost(reinterpret_cast<void **>(&num_act_out_host_), sizeof(std::int32_t)) ==
48+
cudaSuccess);
49+
}
50+
51+
GetIndicesPairsImplicitGemmPlugin::~GetIndicesPairsImplicitGemmPlugin()
52+
{
53+
if (num_act_out_host_ != nullptr) {
54+
cudaFreeHost(num_act_out_host_);
55+
}
4656
}
4757

4858
void GetIndicesPairsImplicitGemmPlugin::initFieldsToSerialize()
@@ -437,9 +447,10 @@ std::int32_t GetIndicesPairsImplicitGemmPlugin::enqueue(
437447

438448
std::int32_t num_act_out_real = std::get<1>(pair_res);
439449
std::int32_t * num_act_out_data = static_cast<std::int32_t *>(outputs[4]);
450+
*num_act_out_host_ = num_act_out_real;
440451

441452
cudaError_t const status = cudaMemcpyAsync(
442-
num_act_out_data, &num_act_out_real, sizeof(std::int32_t), cudaMemcpyHostToDevice, stream);
453+
num_act_out_data, num_act_out_host_, sizeof(std::int32_t), cudaMemcpyHostToDevice, stream);
443454

444455
return status;
445456
}

perception/autoware_tensorrt_plugins/src/get_indices_pairs_plugin.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,16 @@ GetIndicesPairsPlugin::GetIndicesPairsPlugin(
4040
: layer_name_{name}, params_{params}
4141
{
4242
initFieldsToSerialize();
43+
PLUGIN_ASSERT(
44+
cudaMallocHost(reinterpret_cast<void **>(&num_act_out_host_), sizeof(std::int32_t)) ==
45+
cudaSuccess);
46+
}
47+
48+
GetIndicesPairsPlugin::~GetIndicesPairsPlugin()
49+
{
50+
if (num_act_out_host_ != nullptr) {
51+
cudaFreeHost(num_act_out_host_);
52+
}
4353
}
4454

4555
void GetIndicesPairsPlugin::initFieldsToSerialize()
@@ -290,11 +300,10 @@ std::int32_t GetIndicesPairsPlugin::enqueue(
290300
}
291301

292302
std::int32_t * num_act_out_data = static_cast<std::int32_t *>(outputs[3]);
303+
*num_act_out_host_ = num_act_out_real;
293304

294305
cudaError_t const status = cudaMemcpyAsync(
295-
num_act_out_data, &num_act_out_real, sizeof(std::int32_t), cudaMemcpyHostToDevice, stream);
296-
297-
cudaStreamSynchronize(stream);
306+
num_act_out_data, num_act_out_host_, sizeof(std::int32_t), cudaMemcpyHostToDevice, stream);
298307

299308
return status;
300309
}

perception/autoware_tensorrt_plugins/src/unique_ops/unique.cu

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,14 @@ __global__ void write_unique_counts(
188188

189189
} // namespace
190190

191-
std::int64_t unique(
191+
cudaError_t unique(
192192
const std::int64_t * input, std::int64_t * unique, std::int64_t * inverse_indices,
193-
std::int64_t * unique_counts, void * workspace, std::size_t num_input_elements,
194-
std::size_t unique_workspace_size, cudaStream_t stream)
193+
std::int64_t * unique_counts, std::int64_t * num_unique, void * workspace,
194+
std::size_t num_input_elements, std::size_t unique_workspace_size, cudaStream_t stream)
195195
{
196+
(void)unique_workspace_size;
196197
if (num_input_elements == 0U) {
197-
return 0;
198+
return cudaMemsetAsync(num_unique, 0, sizeof(std::int64_t), stream);
198199
}
199200

200201
const auto temp_storage_size = get_unique_temp_storage_size(num_input_elements);
@@ -204,38 +205,61 @@ std::int64_t unique(
204205
auto * input_positions = reinterpret_cast<std::int64_t *>(scratch);
205206
auto * sorted_input = input_positions + num_input_elements;
206207
auto * unique_offsets = sorted_input + num_input_elements;
207-
auto * num_unique_d = unique_offsets + num_input_elements + 1U;
208-
auto * run_ids = reinterpret_cast<std::int32_t *>(num_unique_d + 1U);
208+
auto * run_ids = reinterpret_cast<std::int32_t *>(unique_offsets + num_input_elements + 1U);
209209

210210
const auto num_blocks =
211211
static_cast<unsigned int>((num_input_elements + kThreadsPerBlock - 1U) / kThreadsPerBlock);
212212

213213
fill_iota<<<num_blocks, kThreadsPerBlock, 0, stream>>>(input_positions, num_input_elements);
214-
cub::DeviceRadixSort::SortPairs(
214+
cudaError_t status = cudaGetLastError();
215+
if (status != cudaSuccess) {
216+
return status;
217+
}
218+
219+
status = cub::DeviceRadixSort::SortPairs(
215220
workspace, temp_storage_size, input, sorted_input, input_positions, unique_offsets,
216221
num_input_elements, 0, 64, stream);
222+
if (status != cudaSuccess) {
223+
return status;
224+
}
217225

218226
mark_run_starts<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
219227
sorted_input, run_ids, num_input_elements);
220-
cub::DeviceScan::InclusiveSum(
228+
status = cudaGetLastError();
229+
if (status != cudaSuccess) {
230+
return status;
231+
}
232+
233+
status = cub::DeviceScan::InclusiveSum(
221234
workspace, temp_storage_size, run_ids, run_ids, num_input_elements, stream);
235+
if (status != cudaSuccess) {
236+
return status;
237+
}
222238

223239
scatter_inverse_indices<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
224240
unique_offsets, run_ids, inverse_indices, num_input_elements);
241+
status = cudaGetLastError();
242+
if (status != cudaSuccess) {
243+
return status;
244+
}
225245

226-
cub::DeviceSelect::UniqueByKey(
246+
status = cub::DeviceSelect::UniqueByKey(
227247
workspace, temp_storage_size, sorted_input, input_positions, unique, unique_offsets,
228-
num_unique_d, num_input_elements, stream);
248+
num_unique, num_input_elements, stream);
249+
if (status != cudaSuccess) {
250+
return status;
251+
}
229252

230253
write_unique_offset_sentinel<<<1, 1, 0, stream>>>(
231-
unique_offsets, num_unique_d, num_input_elements);
232-
write_unique_counts<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
233-
unique_offsets, num_unique_d, unique_counts);
254+
unique_offsets, num_unique, num_input_elements);
255+
status = cudaGetLastError();
256+
if (status != cudaSuccess) {
257+
return status;
258+
}
234259

235-
std::int64_t num_out = 0;
236-
cudaMemcpyAsync(&num_out, num_unique_d, sizeof(std::int64_t), cudaMemcpyDeviceToHost, stream);
237-
cudaStreamSynchronize(stream);
238-
return num_out;
260+
write_unique_counts<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
261+
unique_offsets, num_unique, unique_counts);
262+
return cudaGetLastError();
239263
}
240264

241265
std::size_t get_unique_temp_storage_size(std::size_t num_elements)
@@ -247,6 +271,6 @@ std::size_t get_unique_workspace_size(std::size_t num_elements)
247271
{
248272
const auto temp_size = query_unique_temp_storage_size(num_elements);
249273
const auto scratch_offset = align_up(temp_size, alignof(std::int64_t));
250-
return scratch_offset + (3 * num_elements + 2U) * sizeof(std::int64_t) +
274+
return scratch_offset + (3 * num_elements + 1U) * sizeof(std::int64_t) +
251275
num_elements * sizeof(std::int32_t);
252276
}

perception/autoware_tensorrt_plugins/src/unique_plugin.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,19 +165,10 @@ std::int32_t UniquePlugin::enqueue(
165165
{
166166
std::int64_t num_elements = input_desc[0].dims.d[0];
167167
const auto workspace_size = get_unique_workspace_size(static_cast<std::size_t>(num_elements));
168-
169-
std::int64_t num_unique_elements = unique(
168+
return unique(
170169
reinterpret_cast<const std::int64_t *>(inputs[0]), reinterpret_cast<std::int64_t *>(outputs[0]),
171170
reinterpret_cast<std::int64_t *>(outputs[1]), reinterpret_cast<std::int64_t *>(outputs[2]),
172-
workspace, num_elements, workspace_size, stream);
173-
174-
cudaMemcpyAsync(
175-
reinterpret_cast<std::int64_t *>(outputs[3]), &num_unique_elements, sizeof(std::int64_t),
176-
cudaMemcpyHostToDevice, stream);
177-
178-
cudaStreamSynchronize(stream);
179-
180-
return 0;
171+
reinterpret_cast<std::int64_t *>(outputs[3]), workspace, num_elements, workspace_size, stream);
181172
}
182173

183174
std::int32_t UniquePlugin::onShapeChange(

perception/autoware_tensorrt_plugins/test/reference_kernels_test.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -261,15 +261,19 @@ TEST(ReferenceKernelsTest, UniqueMatchesCpuReference)
261261
DeviceBuffer<std::int64_t> unique_d(input.size());
262262
DeviceBuffer<std::int64_t> inverse_d(input.size());
263263
DeviceBuffer<std::int64_t> counts_d(input.size());
264+
DeviceBuffer<std::int64_t> num_unique_d(1U);
264265
DeviceBuffer<std::uint8_t> workspace_d(get_unique_workspace_size(input.size()));
265266

266267
copyToDevice(input_d.get(), input);
267268

268-
const auto num_unique = unique(
269-
input_d.get(), unique_d.get(), inverse_d.get(), counts_d.get(), workspace_d.get(), input.size(),
270-
get_unique_workspace_size(input.size()), stream.get());
269+
ASSERT_EQ(
270+
unique(
271+
input_d.get(), unique_d.get(), inverse_d.get(), counts_d.get(), num_unique_d.get(),
272+
workspace_d.get(), input.size(), get_unique_workspace_size(input.size()), stream.get()),
273+
cudaSuccess);
271274
ASSERT_EQ(cudaStreamSynchronize(stream.get()), cudaSuccess);
272275

276+
const auto num_unique = copyToHost(num_unique_d.get(), 1U).front();
273277
const auto unique_values = copyToHost(unique_d.get(), static_cast<std::size_t>(num_unique));
274278
const auto inverse_indices = copyToHost(inverse_d.get(), input.size());
275279
const auto counts = copyToHost(counts_d.get(), static_cast<std::size_t>(num_unique));

0 commit comments

Comments
 (0)