
Commit a2ce2c0

Author: Dan Toškan (committed)

Implemented stream execution policy for thrust use cases

1 parent dbe0721 · commit a2ce2c0


6 files changed · +25 -23 lines
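All six files apply the same pattern: Thrust calls that previously used the default policy (thrust::device) or an implicit default-stream path now take thrust::cuda::par.on(...), so the work is enqueued on Open3D's current CUDAStream, with an explicit synchronize wherever the host consumes the result. A minimal self-contained sketch of that pattern, using a locally created stream in place of CUDAStream::GetInstance():

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    thrust::device_vector<unsigned int> heap(1024);
    // Enqueue the fill 0, 1, 2, ... on our stream instead of the default policy.
    thrust::sequence(thrust::cuda::par.on(stream), heap.begin(), heap.end(), 0u);

    // Work submitted via par.on(stream) is ordered on that stream only, so wait
    // on the stream before the host (or another stream) uses the data.
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    return 0;
}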


cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu

Lines changed: 2 additions & 1 deletion
@@ -15,8 +15,9 @@ namespace open3d {
 namespace core {
 void CUDAResetHeap(Tensor &heap) {
     uint32_t *heap_ptr = heap.GetDataPtr<uint32_t>();
-    thrust::sequence(thrust::device, heap_ptr, heap_ptr + heap.GetLength(), 0);
+    thrust::sequence(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), heap_ptr, heap_ptr + heap.GetLength(), 0);
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+    cuda::Synchronize(CUDAStream::GetInstance());
 }
 } // namespace core
 } // namespace open3d
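Every call site in this commit repeats thrust::cuda::par.on(CUDAStream::GetInstance().Get()). A hypothetical convenience wrapper (not part of the commit) could keep that in one place; it is shown here with a plain cudaStream_t parameter so the sketch does not depend on Open3D's CUDAStream class:

#include <cuda_runtime.h>
#include <thrust/system/cuda/execution_policy.h>

// Returns the stream-bound Thrust execution policy used throughout this commit.
inline auto ThrustPolicyOn(cudaStream_t stream) {
    return thrust::cuda::par.on(stream);
}

// Usage: thrust::sequence(ThrustPolicyOn(stream), heap_ptr, heap_ptr + n, 0);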

cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h

Lines changed: 9 additions & 7 deletions
@@ -200,20 +200,21 @@ template <typename Key, typename Hash, typename Eq>
 std::vector<int64_t> SlabHashBackend<Key, Hash, Eq>::BucketSizes() const {
     CUDAScopedDevice scoped_device(this->device_);
     thrust::device_vector<int64_t> elems_per_bucket(impl_.bucket_count_);
-    thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);
+    thrust::fill(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), elems_per_bucket.begin(), elems_per_bucket.end(), 0);

     const int64_t num_blocks =
             (impl_.buffer_accessor_.capacity_ + kThreadsPerBlock - 1) /
             kThreadsPerBlock;
     CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock, 0,
-                                core::CUDAStream::GetInstance().Get()>>>(
+                                CUDAStream::GetInstance().Get()>>>(
             impl_, thrust::raw_pointer_cast(elems_per_bucket.data()));
     cuda::Synchronize(CUDAStream::GetInstance());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     std::vector<int64_t> result(impl_.bucket_count_);
-    thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
+    thrust::copy(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), elems_per_bucket.begin(), elems_per_bucket.end(),
                  result.begin());
+    cuda::Synchronize(CUDAStream::GetInstance());
     return result;
 }

@@ -236,8 +237,8 @@ void SlabHashBackend<Key, Hash, Eq>::Insert(
     /// Increase heap_top to pre-allocate potential memory increment and
     /// avoid atomicAdd in kernel.
     int prev_heap_top = this->buffer_->GetHeapTopIndex();
-    *thrust::device_ptr<int>(impl_.buffer_accessor_.heap_top_) =
-            prev_heap_top + count;
+    int new_value = prev_heap_top + count;
+    thrust::fill_n(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), thrust::device_pointer_cast(impl_.buffer_accessor_.heap_top_), 1, new_value);

     const int64_t num_blocks =
             (count + kThreadsPerBlock - 1) / kThreadsPerBlock;

@@ -248,8 +249,9 @@ void SlabHashBackend<Key, Hash, Eq>::Insert(
                                 core::CUDAStream::GetInstance().Get()>>>(
             impl_, input_keys, output_buf_indices, output_masks, count);

-    thrust::device_vector<const void*> input_values_soa_device(
-            input_values_soa.begin(), input_values_soa.end());
+    thrust::device_vector<const void*> input_values_soa_device(input_values_soa.size());
+    thrust::copy(thrust::cuda::par.on(CUDAStream::GetInstance().Get()),
+                 input_values_soa.begin(), input_values_soa.end(), input_values_soa_device.begin());

     int64_t n_values = input_values_soa.size();
     const void* const* ptr_input_values_soa =
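The Insert() change replaces a write through *thrust::device_ptr<int>(...) with thrust::fill_n on the chosen stream, presumably so the single-value update stays ordered with the rest of the stream's work. A standalone sketch of that pattern with hypothetical names (d_value is a raw pointer into device memory):

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/system/cuda/execution_policy.h>

void SetDeviceInt(int* d_value, int new_value, cudaStream_t stream) {
    // Write one int that lives in device memory, keeping the write on `stream`.
    thrust::fill_n(thrust::cuda::par.on(stream),
                   thrust::device_pointer_cast(d_value), 1, new_value);
}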

cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h

Lines changed: 5 additions & 5 deletions
@@ -253,7 +253,7 @@ class SlabNodeManager {
     const uint32_t num_super_blocks = kSuperBlocks;

     thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
-    thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
+    thrust::fill(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), slabs_per_superblock.begin(), slabs_per_superblock.end(),
                  0);

     // Counting total number of allocated memory units.

@@ -264,13 +264,13 @@
             num_cuda_blocks, kThreadsPerBlock, 0,
             core::CUDAStream::GetInstance().Get()>>>(
             impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
-    cuda::Synchronize(CUDAStream::GetInstance());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     std::vector<int> result(num_super_blocks);
-    thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
-                 result.begin());
-
+    OPEN3D_CUDA_CHECK(cudaMemcpyAsync(result.data(), thrust::raw_pointer_cast(slabs_per_superblock.data()), num_super_blocks * sizeof(int), cudaMemcpyDeviceToHost, CUDAStream::GetInstance().Get()));
+    if (!CUDAStream::GetInstance().IsDefaultStream()) {
+        cuda::Synchronize(CUDAStream::GetInstance());
+    }
     return result;
 }
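Here the device-to-host copy switches from thrust::copy to cudaMemcpyAsync on the current stream, followed by a synchronize unless the default stream is in use. A standalone sketch of the readback with hypothetical names; it always waits, which is the conservative form:

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <vector>

std::vector<int> ReadBack(const thrust::device_vector<int>& d_vec, cudaStream_t stream) {
    std::vector<int> host(d_vec.size());
    // Enqueue the copy on the caller's stream rather than the default one.
    cudaMemcpyAsync(host.data(), thrust::raw_pointer_cast(d_vec.data()),
                    d_vec.size() * sizeof(int), cudaMemcpyDeviceToHost, stream);
    // The bytes must have landed before the host reads them.
    cudaStreamSynchronize(stream);
    return host;
}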

cpp/open3d/core/hashmap/CUDA/StdGPUHashBackend.h

Lines changed: 4 additions & 3 deletions
@@ -366,11 +366,12 @@ void StdGPUHashBackend<Key, Hash, Eq>::Insert(
     CUDAScopedDevice scoped_device(this->device_);
     uint32_t threads = 128;
     uint32_t blocks = (count + threads - 1) / threads;
+    int64_t n_values = input_values_soa.size();

-    thrust::device_vector<const void*> input_values_soa_device(
-            input_values_soa.begin(), input_values_soa.end());
+    thrust::device_vector<const void*> input_values_soa_device(n_values);
+    thrust::copy(thrust::cuda::par.on(CUDAStream::GetInstance().Get()),
+                 input_values_soa.begin(), input_values_soa.end(), input_values_soa_device.begin());

-    int64_t n_values = input_values_soa.size();
     const void* const* ptr_input_values_soa =
             thrust::raw_pointer_cast(input_values_soa_device.data());
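As in SlabHashBackend, the device_vector is now sized up front and filled with a stream-bound thrust::copy instead of being constructed from host iterators, which would not go through the chosen stream. A standalone sketch of the upload with hypothetical names:

#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <vector>

thrust::device_vector<const void*> UploadPointers(
        const std::vector<const void*>& host_ptrs, cudaStream_t stream) {
    thrust::device_vector<const void*> device_ptrs(host_ptrs.size());
    // Enqueue the host-to-device copy on the caller's stream.
    thrust::copy(thrust::cuda::par.on(stream),
                 host_ptrs.begin(), host_ptrs.end(), device_ptrs.begin());
    // Wait so the returned vector's contents are ready for the host or any stream.
    cudaStreamSynchronize(stream);
    return device_ptrs;
}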

cpp/open3d/core/kernel/NonZeroCUDA.cu

Lines changed: 4 additions & 4 deletions
@@ -60,8 +60,6 @@ Tensor NonZeroCUDA(const Tensor& src) {
     CUDAScopedDevice scoped_device(src.GetDevice());
     Tensor src_contiguous = src.Contiguous();
     const int64_t num_elements = src_contiguous.NumElements();
-    const int64_t num_bytes =
-            num_elements * src_contiguous.GetDtype().ByteSize();

     thrust::counting_iterator<int64_t> index_first(0);
     thrust::counting_iterator<int64_t> index_last = index_first + num_elements;

@@ -72,9 +70,10 @@ Tensor NonZeroCUDA(const Tensor& src) {
         thrust::device_ptr<const scalar_t> src_ptr(
                 static_cast<const scalar_t*>(src_contiguous.GetDataPtr()));

-        auto it = thrust::copy_if(index_first, index_last, src_ptr,
+        auto it = thrust::copy_if(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), index_first, index_last, src_ptr,
                                   non_zero_indices.begin(),
                                   NonZeroFunctor<scalar_t>());
+        cuda::Synchronize(CUDAStream::GetInstance());
         non_zero_indices.resize(thrust::distance(non_zero_indices.begin(), it));
     });

@@ -88,13 +87,14 @@ Tensor NonZeroCUDA(const Tensor& src) {
     TensorIterator result_iter(result);

     index_last = index_first + num_non_zeros;
-    thrust::for_each(thrust::device,
+    thrust::for_each(thrust::cuda::par.on(CUDAStream::GetInstance().Get()),
                      thrust::make_zip_iterator(thrust::make_tuple(
                              index_first, non_zero_indices.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(
                              index_last, non_zero_indices.end())),
                      FlatIndexTransformFunctor(result_iter, num_non_zeros,
                                                num_dims, shape));
+    cuda::Synchronize(CUDAStream::GetInstance());

     return result;
 }
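NonZeroCUDA keeps its copy_if/for_each structure but runs both on the current stream and synchronizes before the host uses the results (the returned end iterator and the finished index buffer). A standalone sketch of the stream-compaction half, with hypothetical names and a simple float predicate:

#include <cuda_runtime.h>
#include <cstdint>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system/cuda/execution_policy.h>

struct IsNonZero {
    __host__ __device__ bool operator()(float x) const { return x != 0.0f; }
};

thrust::device_vector<int64_t> NonZeroIndices(const thrust::device_vector<float>& src,
                                              cudaStream_t stream) {
    thrust::device_vector<int64_t> indices(src.size());
    thrust::counting_iterator<int64_t> first(0);
    thrust::counting_iterator<int64_t> last = first + src.size();
    // Keep index i wherever the stencil value src[i] is non-zero.
    auto end = thrust::copy_if(thrust::cuda::par.on(stream), first, last,
                               src.begin(), indices.begin(), IsNonZero());
    // Make sure the compaction has finished before the host trims the buffer.
    cudaStreamSynchronize(stream);
    indices.resize(thrust::distance(indices.begin(), end));
    return indices;
}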

cpp/open3d/core/nns/FixedRadiusSearchImpl.cuh

Lines changed: 1 addition & 3 deletions
@@ -963,9 +963,7 @@ void FixedRadiusSearchCUDA(const cudaStream_t& stream,
         cudaMemcpyAsync(&last_prefix_sum_entry,
                         query_neighbors_row_splits + num_queries,
                         sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
-        // wait for the async copies
-        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
-        }
+        cudaStreamSynchronize(stream);
     }
     mem_temp.Free(inclusive_scan_temp);
 }
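The fixed-radius search previously spin-waited on cudaStreamQuery after issuing its async copies; cudaStreamSynchronize blocks the calling host thread until the stream has drained, with no busy loop. A minimal sketch of the resulting single-value readback, with hypothetical names:

#include <cuda_runtime.h>
#include <cstdint>

int64_t ReadLastEntry(const int64_t* d_ptr, cudaStream_t stream) {
    int64_t value = 0;
    cudaMemcpyAsync(&value, d_ptr, sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);  // wait for the async copy to land in `value`
    return value;
}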
