|
| 1 | +import sys |
| 2 | + |
| 3 | +import cuda.cccl.parallel.experimental.algorithms as algorithms |
| 4 | +import cuda.cccl.parallel.experimental.iterators as iterators |
| 5 | +import cuda.core.experimental as core |
| 6 | +import cuda.nvbench as nvbench |
| 7 | +import cupy as cp |
| 8 | +import numpy as np |
| 9 | + |
| 10 | + |
def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
    """Build a ``cuda.core`` stream object from an nvbench stream.

    The value returned by ``cs.addressof()`` is handed to
    ``core.Stream.from_handle`` and the resulting ``core.Stream`` is returned.
    """
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
| 13 | + |
| 14 | + |
def segmented_reduce(state: nvbench.State):
    """Benchmark ``algorithms.segmented_reduce`` on a random int32 matrix.

    Reads the ``numElems`` and ``numCols`` axis values from ``state``,
    derives the row count as ``numElems // numCols``, reports it as the
    ``numRows`` summary, and times a segmented reduction (using addition)
    over a CuPy matrix of that shape, with one output element per row.
    """
    total_elems = state.getInt64("numElems")
    row_len = state.getInt64("numCols")
    row_count = total_elems // row_len

    state.add_summary("numRows", row_count)
    state.collectCUPTIMetrics()

    # Device-side input matrix of random int32 values and the output buffer
    # that receives one value per row.
    d_input = cp.random.default_rng().integers(
        low=-31, high=32, dtype=np.int32, size=(row_count, row_len)
    )
    d_output = cp.empty(row_count, dtype=d_input.dtype)

    # Host-side 0-d array holding the initial value of the reduction.
    h_init = np.zeros((), dtype=np.int32)

    def add_op(a, b):
        return a + b

    def make_row_offset(stride):
        # The stride is bound through a factory so the returned function
        # closes over a fixed value.
        def to_offset(row_id):
            return row_id * stride

        return to_offset

    # Segment start offsets are produced lazily: counting from 0, row i is
    # mapped to i * row_len.
    first_row = np.int32(0)
    start_offsets = iterators.TransformIterator(
        iterators.CountingIterator(first_row),
        make_row_offset(np.int32(row_len)),
    )
    # The end offsets are the start-offset iterator advanced by one
    # (presumably yielding the start of the following row -- confirm against
    # the iterator documentation).
    end_offsets = start_offsets + 1

    reducer = algorithms.segmented_reduce(
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )

    # First call with no temporary storage: the return value is used as the
    # number of bytes of scratch space to allocate.
    scratch_nbytes = reducer(
        None, d_input, d_output, row_count, start_offsets, end_offsets, h_init
    )
    scratch = cp.empty(scratch_nbytes, dtype=cp.uint8)

    def launcher(launch: nvbench.Launch):
        # Run the reduction on the stream supplied by nvbench for this launch.
        stream = as_core_Stream(launch.getStream())
        reducer(
            scratch,
            d_input,
            d_output,
            row_count,
            start_offsets,
            end_offsets,
            h_init,
            stream,
        )

    state.exec(launcher)
| 72 | + |
| 73 | + |
if __name__ == "__main__":
    # Register the benchmark and sweep it over the total element count
    # (2**20, 2**22, 2**24) and the row length (1024 through 8192).
    bench = nvbench.register(segmented_reduce)
    bench.addInt64Axis("numElems", [2**exp for exp in (20, 22, 24)])
    bench.addInt64Axis("numCols", [2**exp for exp in range(10, 14)])

    nvbench.run_all_benchmarks(sys.argv)
0 commit comments