Skip to content

Commit 4772e9b

Browse files
Add examples/cccl_parallel_segmented_reduce.py
1 parent b610543 commit 4772e9b

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import sys
2+
3+
import cuda.cccl.parallel.experimental.algorithms as algorithms
4+
import cuda.cccl.parallel.experimental.iterators as iterators
5+
import cuda.core.experimental as core
6+
import cuda.nvbench as nvbench
7+
import cupy as cp
8+
import numpy as np
9+
10+
11+
def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
    """Return a ``core.Stream`` built from an nvbench stream.

    The value returned by ``cs.addressof()`` is handed to
    ``core.Stream.from_handle``.
    """
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
13+
14+
15+
def segmented_reduce(state: nvbench.State):
    """Benchmark segmented_reduce example.

    Reads the ``numElems`` and ``numCols`` axis values from ``state``,
    derives the row count by floor division, and records it as the
    ``numRows`` summary. A random int32 matrix of shape
    ``(numRows, numCols)`` is then reduced with ``add_op``, one segment
    per row, into a ``numRows``-long output array. The reduction call is
    handed to ``state.exec`` for timing.
    """
    total_elems = state.getInt64("numElems")
    row_len = state.getInt64("numCols")
    row_count = total_elems // row_len

    state.add_summary("numRows", row_count)
    state.collectCUPTIMetrics()

    generator = cp.random.default_rng()
    d_input = generator.integers(
        low=-31, high=32, dtype=np.int32, size=(row_count, row_len)
    )

    def add_op(a, b):
        return a + b

    def make_scaler(step):
        def scale(row_id):
            return row_id * step

        return scale

    # Segment offsets are not materialized: a counting iterator starting at
    # zero is mapped through ``scale`` so that entry i yields i * row_len.
    row_ids = iterators.CountingIterator(np.int32(0))
    start_offsets = iterators.TransformIterator(
        row_ids, make_scaler(np.int32(row_len))
    )

    # End offsets are the start offsets advanced by one position.
    end_offsets = start_offsets + 1

    h_init = np.zeros((), dtype=np.int32)
    d_output = cp.empty(row_count, dtype=d_input.dtype)

    alg = algorithms.segmented_reduce(
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )

    # Arguments shared by the size query and the timed invocation.
    reduce_args = (
        d_input,
        d_output,
        row_count,
        start_offsets,
        end_offsets,
        h_init,
    )

    # Calling with ``None`` as temporary storage queries the required size,
    # which is then allocated as a byte buffer.
    temp_storage = cp.empty(alg(None, *reduce_args), dtype=cp.uint8)

    def launcher(launch: nvbench.Launch):
        # Run the reduction on the stream supplied by the launch object.
        alg(temp_storage, *reduce_args, as_core_Stream(launch.getStream()))

    state.exec(launcher)
72+
73+
74+
if __name__ == "__main__":
    # Register the benchmark, then sweep total element counts
    # (2**20, 2**22, 2**24) against row lengths (1024 through 8192).
    bench = nvbench.register(segmented_reduce)
    bench.addInt64Axis("numElems", [2**exp for exp in (20, 22, 24)])
    bench.addInt64Axis("numCols", [2**exp for exp in (10, 11, 12, 13)])

    nvbench.run_all_benchmarks(sys.argv)

0 commit comments

Comments
 (0)