
Commit a2ce2c0

Author: Dan Toškan (committed)

Implemented stream execution policy for thrust use cases

1 parent dbe0721 · commit a2ce2c0


6 files changed · +25 -23 lines
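All six files apply the same pattern: Thrust calls that previously used the default policy (thrust::device) or an implicit default-stream path now take thrust::cuda::par.on(...), so the work is enqueued on Open3D's current CUDAStream, with an explicit synchronize wherever the host consumes the result. A minimal self-contained sketch of that pattern, using a locally created stream in place of CUDAStream::GetInstance():

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    thrust::device_vector<unsigned int> heap(1024);
    // Enqueue the fill 0, 1, 2, ... on our stream instead of the default policy.
    thrust::sequence(thrust::cuda::par.on(stream), heap.begin(), heap.end(), 0u);

    // Work submitted via par.on(stream) is ordered on that stream only, so wait
    // on the stream before the host (or another stream) uses the data.
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    return 0;
}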


cpp/open3d/core/hashmap/CUDA/CUDAHashBackendBuffer.cu

Lines changed: 2 additions & 1 deletion
@@ -15,8 +15,9 @@ namespace open3d {
 namespace core {
 void CUDAResetHeap(Tensor &heap) {
     uint32_t *heap_ptr = heap.GetDataPtr<uint32_t>();
-    thrust::sequence(thrust::device, heap_ptr, heap_ptr + heap.GetLength(), 0);
+    thrust::sequence(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), heap_ptr, heap_ptr + heap.GetLength(), 0);
     OPEN3D_CUDA_CHECK(cudaGetLastError());
+    cuda::Synchronize(CUDAStream::GetInstance());
 }
 } // namespace core
 } // namespace open3d
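Every call site in this commit repeats thrust::cuda::par.on(CUDAStream::GetInstance().Get()). A hypothetical convenience wrapper (not part of the commit) could keep that in one place; it is shown here with a plain cudaStream_t parameter so the sketch does not depend on Open3D's CUDAStream class:

#include <cuda_runtime.h>
#include <thrust/system/cuda/execution_policy.h>

// Returns the stream-bound Thrust execution policy used throughout this commit.
inline auto ThrustPolicyOn(cudaStream_t stream) {
    return thrust::cuda::par.on(stream);
}

// Usage: thrust::sequence(ThrustPolicyOn(stream), heap_ptr, heap_ptr + n, 0);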

cpp/open3d/core/hashmap/CUDA/SlabHashBackend.h

Lines changed: 9 additions & 7 deletions
@@ -200,20 +200,21 @@ template <typename Key, typename Hash, typename Eq>
 std::vector<int64_t> SlabHashBackend<Key, Hash, Eq>::BucketSizes() const {
     CUDAScopedDevice scoped_device(this->device_);
     thrust::device_vector<int64_t> elems_per_bucket(impl_.bucket_count_);
-    thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);
+    thrust::fill(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), elems_per_bucket.begin(), elems_per_bucket.end(), 0);

     const int64_t num_blocks =
             (impl_.buffer_accessor_.capacity_ + kThreadsPerBlock - 1) /
             kThreadsPerBlock;
     CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock, 0,
-                                core::CUDAStream::GetInstance().Get()>>>(
+                                CUDAStream::GetInstance().Get()>>>(
             impl_, thrust::raw_pointer_cast(elems_per_bucket.data()));
     cuda::Synchronize(CUDAStream::GetInstance());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     std::vector<int64_t> result(impl_.bucket_count_);
-    thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
+    thrust::copy(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), elems_per_bucket.begin(), elems_per_bucket.end(),
                  result.begin());
+    cuda::Synchronize(CUDAStream::GetInstance());
     return result;
 }

@@ -236,8 +237,8 @@ void SlabHashBackend<Key, Hash, Eq>::Insert(
     /// Increase heap_top to pre-allocate potential memory increment and
     /// avoid atomicAdd in kernel.
     int prev_heap_top = this->buffer_->GetHeapTopIndex();
-    *thrust::device_ptr<int>(impl_.buffer_accessor_.heap_top_) =
-            prev_heap_top + count;
+    int new_value = prev_heap_top + count;
+    thrust::fill_n(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), thrust::device_pointer_cast(impl_.buffer_accessor_.heap_top_), 1, new_value);

     const int64_t num_blocks =
             (count + kThreadsPerBlock - 1) / kThreadsPerBlock;

@@ -248,8 +249,9 @@ void SlabHashBackend<Key, Hash, Eq>::Insert(
                                 core::CUDAStream::GetInstance().Get()>>>(
             impl_, input_keys, output_buf_indices, output_masks, count);

-    thrust::device_vector<const void*> input_values_soa_device(
-            input_values_soa.begin(), input_values_soa.end());
+    thrust::device_vector<const void*> input_values_soa_device(input_values_soa.size());
+    thrust::copy(thrust::cuda::par.on(CUDAStream::GetInstance().Get()),
+                 input_values_soa.begin(), input_values_soa.end(), input_values_soa_device.begin());

     int64_t n_values = input_values_soa.size();
     const void* const* ptr_input_values_soa =
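The Insert() change replaces a write through *thrust::device_ptr<int>(...) with thrust::fill_n on the chosen stream, presumably so the single-value update stays ordered with the rest of the stream's work. A standalone sketch of that pattern with hypothetical names (d_value is a raw pointer into device memory):

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/system/cuda/execution_policy.h>

void SetDeviceInt(int* d_value, int new_value, cudaStream_t stream) {
    // Write one int that lives in device memory, keeping the write on `stream`.
    thrust::fill_n(thrust::cuda::par.on(stream),
                   thrust::device_pointer_cast(d_value), 1, new_value);
}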

cpp/open3d/core/hashmap/CUDA/SlabNodeManager.h

Lines changed: 5 additions & 5 deletions
@@ -253,7 +253,7 @@ class SlabNodeManager {
     const uint32_t num_super_blocks = kSuperBlocks;

     thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
-    thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
+    thrust::fill(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), slabs_per_superblock.begin(), slabs_per_superblock.end(),
                  0);

     // Counting total number of allocated memory units.

@@ -264,13 +264,13 @@
             num_cuda_blocks, kThreadsPerBlock, 0,
             core::CUDAStream::GetInstance().Get()>>>(
             impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
-    cuda::Synchronize(CUDAStream::GetInstance());
     OPEN3D_CUDA_CHECK(cudaGetLastError());

     std::vector<int> result(num_super_blocks);
-    thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
-                 result.begin());
-
+    OPEN3D_CUDA_CHECK(cudaMemcpyAsync(result.data(), thrust::raw_pointer_cast(slabs_per_superblock.data()), num_super_blocks * sizeof(int), cudaMemcpyDeviceToHost, CUDAStream::GetInstance().Get()));
+    if (!CUDAStream::GetInstance().IsDefaultStream()) {
+        cuda::Synchronize(CUDAStream::GetInstance());
+    }
     return result;
 }
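Here the device-to-host copy switches from thrust::copy to cudaMemcpyAsync on the current stream, followed by a synchronize unless the default stream is in use. A standalone sketch of the readback with hypothetical names; it always waits, which is the conservative form:

#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <vector>

std::vector<int> ReadBack(const thrust::device_vector<int>& d_vec, cudaStream_t stream) {
    std::vector<int> host(d_vec.size());
    // Enqueue the copy on the caller's stream rather than the default one.
    cudaMemcpyAsync(host.data(), thrust::raw_pointer_cast(d_vec.data()),
                    d_vec.size() * sizeof(int), cudaMemcpyDeviceToHost, stream);
    // The bytes must have landed before the host reads them.
    cudaStreamSynchronize(stream);
    return host;
}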

cpp/open3d/core/hashmap/CUDA/StdGPUHashBackend.h

Lines changed: 4 additions & 3 deletions
@@ -366,11 +366,12 @@ void StdGPUHashBackend<Key, Hash, Eq>::Insert(
     CUDAScopedDevice scoped_device(this->device_);
     uint32_t threads = 128;
     uint32_t blocks = (count + threads - 1) / threads;
+    int64_t n_values = input_values_soa.size();

-    thrust::device_vector<const void*> input_values_soa_device(
-            input_values_soa.begin(), input_values_soa.end());
+    thrust::device_vector<const void*> input_values_soa_device(n_values);
+    thrust::copy(thrust::cuda::par.on(CUDAStream::GetInstance().Get()),
+                 input_values_soa.begin(), input_values_soa.end(), input_values_soa_device.begin());

-    int64_t n_values = input_values_soa.size();
     const void* const* ptr_input_values_soa =
             thrust::raw_pointer_cast(input_values_soa_device.data());
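As in SlabHashBackend, the device_vector is now sized up front and filled with a stream-bound thrust::copy instead of being constructed from host iterators, which would not go through the chosen stream. A standalone sketch of the upload with hypothetical names:

#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <vector>

thrust::device_vector<const void*> UploadPointers(
        const std::vector<const void*>& host_ptrs, cudaStream_t stream) {
    thrust::device_vector<const void*> device_ptrs(host_ptrs.size());
    // Enqueue the host-to-device copy on the caller's stream.
    thrust::copy(thrust::cuda::par.on(stream),
                 host_ptrs.begin(), host_ptrs.end(), device_ptrs.begin());
    // Wait so the returned vector's contents are ready for the host or any stream.
    cudaStreamSynchronize(stream);
    return device_ptrs;
}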

cpp/open3d/core/kernel/NonZeroCUDA.cu

Lines changed: 4 additions & 4 deletions
@@ -60,8 +60,6 @@ Tensor NonZeroCUDA(const Tensor& src) {
     CUDAScopedDevice scoped_device(src.GetDevice());
     Tensor src_contiguous = src.Contiguous();
     const int64_t num_elements = src_contiguous.NumElements();
-    const int64_t num_bytes =
-            num_elements * src_contiguous.GetDtype().ByteSize();

     thrust::counting_iterator<int64_t> index_first(0);
     thrust::counting_iterator<int64_t> index_last = index_first + num_elements;

@@ -72,9 +70,10 @@ Tensor NonZeroCUDA(const Tensor& src) {
         thrust::device_ptr<const scalar_t> src_ptr(
                 static_cast<const scalar_t*>(src_contiguous.GetDataPtr()));

-        auto it = thrust::copy_if(index_first, index_last, src_ptr,
+        auto it = thrust::copy_if(thrust::cuda::par.on(CUDAStream::GetInstance().Get()), index_first, index_last, src_ptr,
                                   non_zero_indices.begin(),
                                   NonZeroFunctor<scalar_t>());
+        cuda::Synchronize(CUDAStream::GetInstance());
         non_zero_indices.resize(thrust::distance(non_zero_indices.begin(), it));
     });

@@ -88,13 +87,14 @@ Tensor NonZeroCUDA(const Tensor& src) {
     TensorIterator result_iter(result);

     index_last = index_first + num_non_zeros;
-    thrust::for_each(thrust::device,
+    thrust::for_each(thrust::cuda::par.on(CUDAStream::GetInstance().Get()),
                      thrust::make_zip_iterator(thrust::make_tuple(
                              index_first, non_zero_indices.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(
                              index_last, non_zero_indices.end())),
                      FlatIndexTransformFunctor(result_iter, num_non_zeros,
                                                num_dims, shape));
+    cuda::Synchronize(CUDAStream::GetInstance());

     return result;
 }
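NonZeroCUDA keeps its copy_if/for_each structure but runs both on the current stream and synchronizes before the host uses the results (the returned end iterator and the finished index buffer). A standalone sketch of the stream-compaction half, with hypothetical names and a simple float predicate:

#include <cuda_runtime.h>
#include <cstdint>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system/cuda/execution_policy.h>

struct IsNonZero {
    __host__ __device__ bool operator()(float x) const { return x != 0.0f; }
};

thrust::device_vector<int64_t> NonZeroIndices(const thrust::device_vector<float>& src,
                                              cudaStream_t stream) {
    thrust::device_vector<int64_t> indices(src.size());
    thrust::counting_iterator<int64_t> first(0);
    thrust::counting_iterator<int64_t> last = first + src.size();
    // Keep index i wherever the stencil value src[i] is non-zero.
    auto end = thrust::copy_if(thrust::cuda::par.on(stream), first, last,
                               src.begin(), indices.begin(), IsNonZero());
    // Make sure the compaction has finished before the host trims the buffer.
    cudaStreamSynchronize(stream);
    indices.resize(thrust::distance(indices.begin(), end));
    return indices;
}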

cpp/open3d/core/nns/FixedRadiusSearchImpl.cuh

Lines changed: 1 addition & 3 deletions
@@ -963,9 +963,7 @@ void FixedRadiusSearchCUDA(const cudaStream_t& stream,
         cudaMemcpyAsync(&last_prefix_sum_entry,
                         query_neighbors_row_splits + num_queries,
                         sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
-        // wait for the async copies
-        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
-        }
+        cudaStreamSynchronize(stream);
     }
     mem_temp.Free(inclusive_scan_temp);
 }
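The fixed-radius search previously spin-waited on cudaStreamQuery after issuing its async copies; cudaStreamSynchronize blocks the calling host thread until the stream has drained, with no busy loop. A minimal sketch of the resulting single-value readback, with hypothetical names:

#include <cuda_runtime.h>
#include <cstdint>

int64_t ReadLastEntry(const int64_t* d_ptr, cudaStream_t stream) {
    int64_t value = 0;
    cudaMemcpyAsync(&value, d_ptr, sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);  // wait for the async copy to land in `value`
    return value;
}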
