
Commit b5e4b4b

cuda.nvbench -> cuda.bench
Per PR review suggestion:
- `cuda.parallel` - device-wide algorithms/Thrust
- `cuda.cooperative` - cooperative algorithms/CUB
- `cuda.bench` - benchmarking/NVBench
1 parent c2a2acc commit b5e4b4b

19 files changed: +136 -140 lines changed
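In practice the rename only changes the import path; the API itself is unchanged. Below is a minimal sketch of a benchmark written against the renamed package, assembled from the examples touched by this commit (the function name `sleep_bench` is illustrative; `register`, `State`, `Launch`, `state.exec`, `set_is_cpu_only`, and `run_all_benchmarks` all appear in the diffs that follow):

import sys
import time

import cuda.bench as bench  # previously: import cuda.nvbench as nvbench


def sleep_bench(state: bench.State) -> None:
    # work to be timed is wrapped in a launcher that receives a Launch
    def launcher(launch: bench.Launch):
        time.sleep(1e-3)

    state.exec(launcher)


if __name__ == "__main__":
    b = bench.register(sleep_bench)
    b.set_is_cpu_only(True)  # CPU-only timing, as in cpu_activity.py below
    bench.run_all_benchmarks(sys.argv)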

python/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -33,9 +33,9 @@ set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
 set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
 set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-install(TARGETS _nvbench DESTINATION cuda/nvbench)
+install(TARGETS _nvbench DESTINATION cuda/bench)
 
 # Determine target that nvbench::nvbench is an alias of,
 # necessary because ALIAS targets cannot be installed
 get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
-install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/nvbench)
+install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)

python/cuda/nvbench/__init__.py renamed to python/cuda/bench/__init__.py

Lines changed: 7 additions & 7 deletions
@@ -34,25 +34,25 @@
 for libname in ("cupti", "nvperf_target", "nvperf_host"):
     load_nvidia_dynamic_lib(libname)
 
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     Benchmark as Benchmark,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     CudaStream as CudaStream,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     Launch as Launch,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     NVBenchRuntimeError as NVBenchRuntimeError,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     State as State,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     register as register,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     run_all_benchmarks as run_all_benchmarks,
 )

python/cuda/nvbench/__init__.pyi renamed to python/cuda/bench/__init__.pyi

Lines changed: 2 additions & 2 deletions
@@ -44,9 +44,9 @@ class CudaStream:
     Example
     -------
     import cuda.core.experimental as core
-    import cuda.nvbench as nvbench
+    import cuda.bench as bench
 
-    def bench(state: nvbench.State):
+    def bench(state: bench.State):
         dev = core.Device(state.get_device())
         dev.set_current()
         # converts CudaStream to core.Stream
File renamed without changes.

python/examples/auto_throughput.py

Lines changed: 6 additions & 6 deletions
@@ -16,12 +16,12 @@
 
 import sys
 
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import numpy as np
 from numba import cuda
 
 
-def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
 
 
@@ -39,7 +39,7 @@ def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
     return kernel
 
 
-def throughput_bench(state: nvbench.State) -> None:
+def throughput_bench(state: bench.State) -> None:
     stride = state.get_int64("Stride")
     ipt = state.get_int64("ItemsPerThread")
 
@@ -58,7 +58,7 @@ def throughput_bench(state: nvbench.State) -> None:
 
     krn = make_throughput_kernel(ipt)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         exec_stream = as_cuda_stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](
             stride, elements, inp_arr, out_arr
@@ -68,8 +68,8 @@ def launcher(launch: nvbench.Launch):
 
 
 if __name__ == "__main__":
-    b = nvbench.register(throughput_bench)
+    b = bench.register(throughput_bench)
     b.add_int64_axis("Stride", [1, 2, 4])
     b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/axes.py

Lines changed: 17 additions & 17 deletions
@@ -18,12 +18,12 @@
 import sys
 from typing import Dict, Optional, Tuple
 
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 
 
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
@@ -58,34 +58,34 @@ def make_sleep_kernel():
     return mod.get_kernel("sleep_kernel")
 
 
-def simple(state: nvbench.State):
+def simple(state: bench.State):
     state.set_min_samples(1000)
     sleep_dur = 1e-3
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
 
 
-def single_float64_axis(state: nvbench.State):
+def single_float64_axis(state: bench.State):
     # get axis value, or default
     default_sleep_dur = 3.14e-4
     sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
 
 
-def default_value(state: nvbench.State):
+def default_value(state: bench.State):
     single_float64_axis(state)
 
 
@@ -120,7 +120,7 @@ def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = No
     return mod.get_kernel(instance_name)
 
 
-def copy_sweep_grid_shape(state: nvbench.State):
+def copy_sweep_grid_shape(state: bench.State):
     block_size = state.get_int64("BlockSize")
     num_blocks = state.get_int64("NumBlocks")
 
@@ -140,14 +140,14 @@ def copy_sweep_grid_shape(state: nvbench.State):
     krn = make_copy_kernel()
     launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
     state.exec(launcher)
 
 
-def copy_type_sweep(state: nvbench.State):
+def copy_type_sweep(state: bench.State):
     type_id = state.get_int64("TypeID")
 
     types_map: Dict[int, Tuple[type, str]] = {
@@ -178,7 +178,7 @@ def copy_type_sweep(state: nvbench.State):
     krn = make_copy_kernel(value_cuda_t, value_cuda_t)
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
@@ -187,20 +187,20 @@ def launcher(launch: nvbench.Launch):
 
 if __name__ == "__main__":
     # Benchmark without axes
-    nvbench.register(simple)
+    bench.register(simple)
 
     # benchmark with no axes, that uses default value
-    nvbench.register(default_value)
+    bench.register(default_value)
     # specify axis
-    nvbench.register(single_float64_axis).add_float64_axis(
+    bench.register(single_float64_axis).add_float64_axis(
         "Duration (s)", [7e-5, 1e-4, 5e-4]
     )
 
-    copy1_bench = nvbench.register(copy_sweep_grid_shape)
+    copy1_bench = bench.register(copy_sweep_grid_shape)
     copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
     copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
 
-    copy2_bench = nvbench.register(copy_type_sweep)
+    copy2_bench = bench.register(copy_type_sweep)
     copy2_bench.add_int64_axis("TypeID", range(0, 6))
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/cccl_cooperative_block_reduce.py

Lines changed: 6 additions & 10 deletions
@@ -16,8 +16,8 @@
 
 import sys
 
+import cuda.bench as bench
 import cuda.cccl.cooperative.experimental as coop
-import cuda.nvbench as nvbench
 import numba
 import numpy as np
 from numba import cuda
@@ -45,11 +45,11 @@ def mul(op1, op2):
     return op1 & op2
 
 
-def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
 
 
-def multi_block_bench(state: nvbench.State):
+def multi_block_bench(state: bench.State):
     threads_per_block = state.get_int64("ThreadsPerBlock")
     num_blocks = state.get_int64("NumBlocks")
     total_elements = threads_per_block * num_blocks
@@ -78,15 +78,11 @@ def kernel(inp_arr, out_arr):
     d_inp = cuda.to_device(h_inp)
     d_out = cuda.device_array(num_blocks, dtype=ring.dt)
 
-    cuda_s = as_cuda_Stream(state.get_stream())
-    # warmup
-    kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)
-
     state.add_element_count(total_elements)
     state.add_global_memory_reads(total_elements * h_inp.itemsize)
     state.add_global_memory_writes(num_blocks * h_inp.itemsize)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         cuda_s = as_cuda_Stream(launch.get_stream())
         kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)
 
@@ -96,8 +92,8 @@ def launcher(launch: nvbench.Launch):
 if __name__ == "__main__":
     patch.patch_numba_linker(lto=True)
 
-    b = nvbench.register(multi_block_bench)
+    b = bench.register(multi_block_bench)
     b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
     b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/cccl_parallel_segmented_reduce.py

Lines changed: 8 additions & 8 deletions
@@ -16,10 +16,10 @@
 
 import sys
 
+import cuda.bench as bench
 import cuda.cccl.parallel.experimental.algorithms as algorithms
 import cuda.cccl.parallel.experimental.iterators as iterators
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 import cupy as cp
 import numpy as np
 
@@ -34,22 +34,22 @@ def __cuda_stream__(self):
         return (0, self._ptr)
 
 
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
-def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream:
+def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
     return CCCLStream(cs.addressof())
 
 
 def as_cp_ExternalStream(
-    cs: nvbench.CudaStream, dev_id: int | None = -1
+    cs: bench.CudaStream, dev_id: int | None = -1
 ) -> cp.cuda.ExternalStream:
     h = cs.addressof()
     return cp.cuda.ExternalStream(h, dev_id)
 
 
-def segmented_reduce(state: nvbench.State):
+def segmented_reduce(state: bench.State):
     "Benchmark segmented_reduce example"
     n_elems = state.get_int64("numElems")
     n_cols = state.get_int64("numCols")
@@ -100,7 +100,7 @@ def scale(row_id):
     with cp_stream:
         temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_cccl_Stream(launch.get_stream())
         alg(
             temp_storage,
@@ -117,8 +117,8 @@ def launcher(launch: nvbench.Launch):
 
 
 if __name__ == "__main__":
-    b = nvbench.register(segmented_reduce)
+    b = bench.register(segmented_reduce)
     b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
     b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

python/examples/cpu_activity.py

Lines changed: 9 additions & 9 deletions
@@ -17,21 +17,21 @@
 import sys
 import time
 
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 
 host_sleep_duration = 0.1
 
 
-def cpu_only_sleep_bench(state: nvbench.State) -> None:
-    def launcher(launch: nvbench.Launch):
+def cpu_only_sleep_bench(state: bench.State) -> None:
+    def launcher(launch: bench.Launch):
         time.sleep(host_sleep_duration)
 
     state.exec(launcher)
 
 
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
@@ -66,15 +66,15 @@ def make_sleep_kernel():
     return mod.get_kernel("sleep_kernel")
 
 
-def mixed_sleep_bench(state: nvbench.State) -> None:
+def mixed_sleep_bench(state: bench.State) -> None:
     sync = state.get_string("Sync")
     sync_flag = sync == "Do sync"
 
     gpu_sleep_dur = 225e-3
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         # host overhead
         time.sleep(host_sleep_duration)
         # GPU computation
@@ -87,11 +87,11 @@ def launcher(launch: nvbench.Launch):
 if __name__ == "__main__":
     # time function only doing work (sleeping) on the host
     # using CPU timer only
-    b = nvbench.register(cpu_only_sleep_bench)
+    b = bench.register(cpu_only_sleep_bench)
     b.set_is_cpu_only(True)
 
     # time the function that does work on both GPU and CPU
-    b2 = nvbench.register(mixed_sleep_bench)
+    b2 = bench.register(mixed_sleep_bench)
     b2.add_string_axis("Sync", ["Do not sync", "Do sync"])
 
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)
