Skip to content

Commit c73df31

Browse files
Corrected broken cccl_parallel_segmented_reduce.py
1 parent ae6728f commit c73df31

File tree

1 file changed

+39
-7
lines changed

1 file changed

+39
-7
lines changed

python/examples/cccl_parallel_segmented_reduce.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,31 @@
88
import numpy as np
99

1010

11+
class CCCLStream:
12+
"Class to work around https://github.com/NVIDIA/cccl/issues/5144"
13+
14+
def __init__(self, ptr):
15+
self._ptr = ptr
16+
17+
def __cuda_stream__(self):
18+
return (0, self._ptr)
19+
20+
1121
def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
1222
return core.Stream.from_handle(cs.addressof())
1323

1424

25+
def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream:
26+
return CCCLStream(cs.addressof())
27+
28+
29+
def as_cp_ExternalStream(
30+
cs: nvbench.CudaStream, dev_id: int = -1
31+
) -> cp.cuda.ExternalStream:
32+
h = cs.addressof()
33+
return cp.cuda.ExternalStream(h, dev_id)
34+
35+
1536
def segmented_reduce(state: nvbench.State):
1637
"Benchmark segmented_reduce example"
1738
n_elems = state.getInt64("numElems")
@@ -21,8 +42,12 @@ def segmented_reduce(state: nvbench.State):
2142
state.add_summary("numRows", n_rows)
2243
state.collectCUPTIMetrics()
2344

24-
rng = cp.random.default_rng()
25-
mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
45+
dev_id = state.getDevice()
46+
cp_stream = as_cp_ExternalStream(state.getStream(), dev_id)
47+
48+
with cp_stream:
49+
rng = cp.random.default_rng()
50+
mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
2651

2752
def add_op(a, b):
2853
return a + b
@@ -41,22 +66,29 @@ def scale(row_id):
4166

4267
end_offsets = start_offsets + 1
4368

44-
d_input = mat
4569
h_init = np.zeros(tuple(), dtype=np.int32)
46-
d_output = cp.empty(n_rows, dtype=d_input.dtype)
70+
with cp_stream:
71+
d_input = mat
72+
d_output = cp.empty(n_rows, dtype=d_input.dtype)
4773

4874
alg = algorithms.segmented_reduce(
4975
d_input, d_output, start_offsets, end_offsets, add_op, h_init
5076
)
5177

78+
# print(1)
79+
cccl_stream = as_cccl_Stream(state.getStream())
80+
# print(2, core_stream, core_stream.__cuda_stream__())
5281
# query size of temporary storage and allocate
5382
temp_nbytes = alg(
54-
None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init
83+
None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
5584
)
56-
temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
85+
h_init = np.zeros(tuple(), dtype=np.int32)
86+
# print(3)
87+
with cp_stream:
88+
temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
5789

5890
def launcher(launch: nvbench.Launch):
59-
s = as_core_Stream(launch.getStream())
91+
s = as_cccl_Stream(launch.getStream())
6092
alg(
6193
temp_storage,
6294
d_input,

0 commit comments

Comments
 (0)