Corrected broken cccl_parallel_segmented_reduce.py

oleksandr-pavlyk · oleksandr-pavlyk · commit c73df3100b05 · 2025-07-03T11:42:16.000-05:00
diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py
@@ -8,10 +8,31 @@
 import numpy as np
 
 
+class CCCLStream:
+    "Minimal __cuda_stream__ provider; works around https://github.com/NVIDIA/cccl/issues/5144"
+
+    def __init__(self, ptr):
+        self._ptr = ptr  # stored as-is; returned later by __cuda_stream__
+
+    def __cuda_stream__(self):
+        return (0, self._ptr)  # presumably (protocol version, handle) — TODO confirm
+
+
 def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
+def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream:
+    return CCCLStream(cs.addressof())  # wrap the value of cs.addressof() in the adapter
+
+
+def as_cp_ExternalStream(
+    cs: nvbench.CudaStream, dev_id: int = -1
+) -> cp.cuda.ExternalStream:
+    h = cs.addressof()  # stream handle value handed to CuPy below
+    return cp.cuda.ExternalStream(h, dev_id)  # dev_id forwarded unchanged (default -1)
+
+
 def segmented_reduce(state: nvbench.State):
     "Benchmark segmented_reduce example"
     n_elems = state.getInt64("numElems")
@@ -21,8 +42,12 @@ def segmented_reduce(state: nvbench.State):
     state.add_summary("numRows", n_rows)
     state.collectCUPTIMetrics()
 
-    rng = cp.random.default_rng()
-    mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
+    dev_id = state.getDevice()
+    cp_stream = as_cp_ExternalStream(state.getStream(), dev_id)
+
+    with cp_stream:
+        rng = cp.random.default_rng()
+        mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
 
     def add_op(a, b):
         return a + b
@@ -41,22 +66,29 @@ def scale(row_id):
 
     end_offsets = start_offsets + 1
 
-    d_input = mat
     h_init = np.zeros(tuple(), dtype=np.int32)
-    d_output = cp.empty(n_rows, dtype=d_input.dtype)
+    with cp_stream:
+        d_input = mat
+        d_output = cp.empty(n_rows, dtype=d_input.dtype)
 
     alg = algorithms.segmented_reduce(
         d_input, d_output, start_offsets, end_offsets, add_op, h_init
     )
 
+    # print(1)
+    cccl_stream = as_cccl_Stream(state.getStream())
+    # print(2, cccl_stream, cccl_stream.__cuda_stream__())
     # query size of temporary storage and allocate
     temp_nbytes = alg(
-        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init
+        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
     )
-    temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
+    h_init = np.zeros(tuple(), dtype=np.int32)
+    # print(3)
+    with cp_stream:
+        temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_cccl_Stream(launch.getStream())
         alg(
             temp_storage,
             d_input,