# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
#     http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

import cuda.cccl.cooperative.experimental as coop
import cuda.nvbench as nvbench
import numba
import numpy as np
from numba import cuda
from pynvjitlink import patch


class BitsetRing:
    """
    Ring over fixed-width unsigned integers viewed as bitsets,
    with ring_plus = bitwise_or, ring_mul = bitwise_and,
    ring_zero = 0 (no bits set), and ring_one = ~0 (all bits set).
    """

    def __init__(self):
        self.dt = np.uint64
        self.zero = self.dt(0)
        self.one = np.bitwise_invert(self.zero)

    @staticmethod
    def add(op1, op2):
        return op1 | op2

    @staticmethod
    def mul(op1, op2):
        return op1 & op2


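# Illustrative host-side check (not part of the benchmark) that `zero` and
# `one` are the identities of `add` and `mul`, respectively:
#
#   ring = BitsetRing()
#   x = ring.dt(0b1010)
#   assert BitsetRing.add(ring.zero, x) == x  # 0 | x == x
#   assert BitsetRing.mul(ring.one, x) == x   # ~0 & x == x

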
def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
    # Wrap the raw CUDA stream handle provided by nvbench as a Numba
    # external stream so kernel launches can target it
    return cuda.external_stream(cs.addressof())


def multi_block_bench(state: nvbench.State):
    threads_per_block = state.get_int64("ThreadsPerBlock")
    num_blocks = state.get_int64("NumBlocks")
    total_elements = threads_per_block * num_blocks

    if total_elements > 2**26:
        state.skip(reason="Memory footprint over threshold")
        return

    ring = BitsetRing()
    block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add)

    @cuda.jit(link=block_reduce.files)
    def kernel(inp_arr, out_arr):
        # Each thread contributes one element
        block_idx = cuda.blockIdx.x
        thread_idx = cuda.threadIdx.x
        global_idx = block_idx * threads_per_block + thread_idx

        block_output = block_reduce(inp_arr[global_idx])

        # The aggregate is only valid in thread 0, so it alone writes the
        # per-block result
        if thread_idx == 0:
            out_arr[block_idx] = block_output

    h_inp = np.arange(1, total_elements + 1, dtype=ring.dt)
    d_inp = cuda.to_device(h_inp)
    d_out = cuda.device_array(num_blocks, dtype=ring.dt)

    cuda_s = as_cuda_Stream(state.get_stream())
    # Warm-up launch so JIT compilation happens outside the timed region
    kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)

    state.add_element_count(total_elements)
    state.add_global_memory_reads(total_elements * h_inp.itemsize)
    state.add_global_memory_writes(num_blocks * h_inp.itemsize)

    def launcher(launch: nvbench.Launch):
        cuda_s = as_cuda_Stream(launch.get_stream())
        kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)

    state.exec(launcher)
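
    # Optional correctness sketch (commented out; assumes the data layout
    # set up above): each block's output should equal the bitwise-OR
    # reduction of its slice of the input.
    #
    #   h_out = d_out.copy_to_host()
    #   for b in range(num_blocks):
    #       sl = h_inp[b * threads_per_block : (b + 1) * threads_per_block]
    #       assert h_out[b] == np.bitwise_or.reduce(sl)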


if __name__ == "__main__":
    # Patch Numba's linker with LTO enabled so the LTO-IR device functions
    # produced by cuda.cccl.cooperative can be linked into the kernel
    patch.patch_numba_linker(lto=True)

    b = nvbench.register(multi_block_bench)
    b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
    b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])

    nvbench.run_all_benchmarks(sys.argv)
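
# Command-line arguments are forwarded to nvbench, so its usual CLI options
# (e.g. constraining an axis with -a/--axis) should apply. A hypothetical
# invocation, with the script name and flag spelling as assumptions:
#
#   python this_benchmark.py --axis "ThreadsPerBlock=128"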