
Commit b5e4b4b

cuda.nvbench -> cuda.bench
Per PR review suggestion:
- `cuda.parallel` - device-wide algorithms/Thrust
- `cuda.cooperative` - cooperative algorithms/CUB
- `cuda.bench` - benchmarking/NVBench
1 parent c2a2acc commit b5e4b4b

19 files changed: +136 -140 lines changed
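In practice the rename only changes the import path; the API itself is unchanged. Below is a minimal sketch of a benchmark written against the renamed package, assembled from the examples touched by this commit (the function name `sleep_bench` is illustrative; `register`, `State`, `Launch`, `state.exec`, `set_is_cpu_only`, and `run_all_benchmarks` all appear in the diffs that follow):

import sys
import time

import cuda.bench as bench  # previously: import cuda.nvbench as nvbench


def sleep_bench(state: bench.State) -> None:
    # work to be timed is wrapped in a launcher that receives a Launch
    def launcher(launch: bench.Launch):
        time.sleep(1e-3)

    state.exec(launcher)


if __name__ == "__main__":
    b = bench.register(sleep_bench)
    b.set_is_cpu_only(True)  # CPU-only timing, as in cpu_activity.py below
    bench.run_all_benchmarks(sys.argv)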

python/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -33,9 +33,9 @@ set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
 set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
 set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-install(TARGETS _nvbench DESTINATION cuda/nvbench)
+install(TARGETS _nvbench DESTINATION cuda/bench)
 
 # Determine target that nvbench::nvbench is an alias of,
 # necessary because ALIAS targets cannot be installed
 get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
-install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/nvbench)
+install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)

python/cuda/nvbench/__init__.py renamed to python/cuda/bench/__init__.py

Lines changed: 7 additions & 7 deletions
@@ -34,25 +34,25 @@
 for libname in ("cupti", "nvperf_target", "nvperf_host"):
     load_nvidia_dynamic_lib(libname)
 
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     Benchmark as Benchmark,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     CudaStream as CudaStream,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     Launch as Launch,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     NVBenchRuntimeError as NVBenchRuntimeError,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     State as State,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     register as register,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     run_all_benchmarks as run_all_benchmarks,
 )

python/cuda/nvbench/__init__.pyi renamed to python/cuda/bench/__init__.pyi

Lines changed: 2 additions & 2 deletions
@@ -44,9 +44,9 @@ class CudaStream:
     Example
     -------
     import cuda.core.experimental as core
-    import cuda.nvbench as nvbench
+    import cuda.bench as bench
 
-    def bench(state: nvbench.State):
+    def bench(state: bench.State):
         dev = core.Device(state.get_device())
         dev.set_current()
         # converts CudaStream to core.Stream
File renamed without changes.

python/examples/auto_throughput.py

Lines changed: 6 additions & 6 deletions
@@ -16,12 +16,12 @@
 
 import sys
 
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import numpy as np
 from numba import cuda
 
 
-def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
 
 
@@ -39,7 +39,7 @@ def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
     return kernel
 
 
-def throughput_bench(state: nvbench.State) -> None:
+def throughput_bench(state: bench.State) -> None:
     stride = state.get_int64("Stride")
     ipt = state.get_int64("ItemsPerThread")
 
@@ -58,7 +58,7 @@ def throughput_bench(state: nvbench.State) -> None:
 
     krn = make_throughput_kernel(ipt)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         exec_stream = as_cuda_stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](
             stride, elements, inp_arr, out_arr
@@ -68,8 +68,8 @@ def launcher(launch: nvbench.Launch):
 
 
 if __name__ == "__main__":
-    b = nvbench.register(throughput_bench)
+    b = bench.register(throughput_bench)
     b.add_int64_axis("Stride", [1, 2, 4])
     b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/axes.py

Lines changed: 17 additions & 17 deletions
@@ -18,12 +18,12 @@
 import sys
 from typing import Dict, Optional, Tuple
 
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 
 
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
@@ -58,34 +58,34 @@ def make_sleep_kernel():
     return mod.get_kernel("sleep_kernel")
 
 
-def simple(state: nvbench.State):
+def simple(state: bench.State):
     state.set_min_samples(1000)
     sleep_dur = 1e-3
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
 
 
-def single_float64_axis(state: nvbench.State):
+def single_float64_axis(state: bench.State):
     # get axis value, or default
     default_sleep_dur = 3.14e-4
     sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
 
 
-def default_value(state: nvbench.State):
+def default_value(state: bench.State):
     single_float64_axis(state)
 
 
@@ -120,7 +120,7 @@ def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = No
     return mod.get_kernel(instance_name)
 
 
-def copy_sweep_grid_shape(state: nvbench.State):
+def copy_sweep_grid_shape(state: bench.State):
     block_size = state.get_int64("BlockSize")
     num_blocks = state.get_int64("NumBlocks")
 
@@ -140,14 +140,14 @@ def copy_sweep_grid_shape(state: nvbench.State):
     krn = make_copy_kernel()
     launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
     state.exec(launcher)
 
 
-def copy_type_sweep(state: nvbench.State):
+def copy_type_sweep(state: bench.State):
     type_id = state.get_int64("TypeID")
 
     types_map: Dict[int, Tuple[type, str]] = {
@@ -178,7 +178,7 @@ def copy_type_sweep(state: nvbench.State):
     krn = make_copy_kernel(value_cuda_t, value_cuda_t)
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
@@ -187,20 +187,20 @@ def launcher(launch: nvbench.Launch):
 
 if __name__ == "__main__":
     # Benchmark without axes
-    nvbench.register(simple)
+    bench.register(simple)
 
     # benchmark with no axes, that uses default value
-    nvbench.register(default_value)
+    bench.register(default_value)
     # specify axis
-    nvbench.register(single_float64_axis).add_float64_axis(
+    bench.register(single_float64_axis).add_float64_axis(
         "Duration (s)", [7e-5, 1e-4, 5e-4]
     )
 
-    copy1_bench = nvbench.register(copy_sweep_grid_shape)
+    copy1_bench = bench.register(copy_sweep_grid_shape)
     copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
     copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
 
-    copy2_bench = nvbench.register(copy_type_sweep)
+    copy2_bench = bench.register(copy_type_sweep)
     copy2_bench.add_int64_axis("TypeID", range(0, 6))
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/cccl_cooperative_block_reduce.py

Lines changed: 6 additions & 10 deletions
@@ -16,8 +16,8 @@
 
 import sys
 
+import cuda.bench as bench
 import cuda.cccl.cooperative.experimental as coop
-import cuda.nvbench as nvbench
 import numba
 import numpy as np
 from numba import cuda
@@ -45,11 +45,11 @@ def mul(op1, op2):
     return op1 & op2
 
 
-def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
 
 
-def multi_block_bench(state: nvbench.State):
+def multi_block_bench(state: bench.State):
     threads_per_block = state.get_int64("ThreadsPerBlock")
     num_blocks = state.get_int64("NumBlocks")
     total_elements = threads_per_block * num_blocks
@@ -78,15 +78,11 @@ def kernel(inp_arr, out_arr):
     d_inp = cuda.to_device(h_inp)
     d_out = cuda.device_array(num_blocks, dtype=ring.dt)
 
-    cuda_s = as_cuda_Stream(state.get_stream())
-    # warmup
-    kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)
-
     state.add_element_count(total_elements)
     state.add_global_memory_reads(total_elements * h_inp.itemsize)
     state.add_global_memory_writes(num_blocks * h_inp.itemsize)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         cuda_s = as_cuda_Stream(launch.get_stream())
         kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)
 
@@ -96,8 +92,8 @@ def launcher(launch: nvbench.Launch):
 if __name__ == "__main__":
     patch.patch_numba_linker(lto=True)
 
-    b = nvbench.register(multi_block_bench)
+    b = bench.register(multi_block_bench)
     b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
     b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/cccl_parallel_segmented_reduce.py

Lines changed: 8 additions & 8 deletions
@@ -16,10 +16,10 @@
 
 import sys
 
+import cuda.bench as bench
 import cuda.cccl.parallel.experimental.algorithms as algorithms
 import cuda.cccl.parallel.experimental.iterators as iterators
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 import cupy as cp
 import numpy as np
 
@@ -34,22 +34,22 @@ def __cuda_stream__(self):
         return (0, self._ptr)
 
 
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
-def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream:
+def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
     return CCCLStream(cs.addressof())
 
 
 def as_cp_ExternalStream(
-    cs: nvbench.CudaStream, dev_id: int | None = -1
+    cs: bench.CudaStream, dev_id: int | None = -1
 ) -> cp.cuda.ExternalStream:
     h = cs.addressof()
     return cp.cuda.ExternalStream(h, dev_id)
 
 
-def segmented_reduce(state: nvbench.State):
+def segmented_reduce(state: bench.State):
     "Benchmark segmented_reduce example"
     n_elems = state.get_int64("numElems")
     n_cols = state.get_int64("numCols")
@@ -100,7 +100,7 @@ def scale(row_id):
     with cp_stream:
         temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_cccl_Stream(launch.get_stream())
         alg(
             temp_storage,
@@ -117,8 +117,8 @@ def launcher(launch: nvbench.Launch):
 
 
 if __name__ == "__main__":
-    b = nvbench.register(segmented_reduce)
+    b = bench.register(segmented_reduce)
     b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
     b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/cpu_activity.py

Lines changed: 9 additions & 9 deletions
@@ -17,21 +17,21 @@
 import sys
 import time
 
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 
 host_sleep_duration = 0.1
 
 
-def cpu_only_sleep_bench(state: nvbench.State) -> None:
-    def launcher(launch: nvbench.Launch):
+def cpu_only_sleep_bench(state: bench.State) -> None:
+    def launcher(launch: bench.Launch):
         time.sleep(host_sleep_duration)
 
     state.exec(launcher)
 
 
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
@@ -66,15 +66,15 @@ def make_sleep_kernel():
     return mod.get_kernel("sleep_kernel")
 
 
-def mixed_sleep_bench(state: nvbench.State) -> None:
+def mixed_sleep_bench(state: bench.State) -> None:
     sync = state.get_string("Sync")
     sync_flag = sync == "Do sync"
 
     gpu_sleep_dur = 225e-3
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         # host overhead
         time.sleep(host_sleep_duration)
         # GPU computation
@@ -87,11 +87,11 @@ def launcher(launch: nvbench.Launch):
 if __name__ == "__main__":
     # time function only doing work (sleeping) on the host
     # using CPU timer only
-    b = nvbench.register(cpu_only_sleep_bench)
+    b = bench.register(cpu_only_sleep_bench)
     b.set_is_cpu_only(True)
 
     # time the function that does work on both GPU and CPU
-    b2 = nvbench.register(mixed_sleep_bench)
+    b2 = bench.register(mixed_sleep_bench)
     b2.add_string_axis("Sync", ["Do not sync", "Do sync"])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)
