Update tests and examples to use snake_case method names instead of camelCase, following the change in the implementation

oleksandr-pavlyk · oleksandr-pavlyk · commit 5568b3fc444f · 2025-07-21T12:50:28.000-05:00
diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py
@@ -41,18 +41,18 @@ def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
 
 
 def throughput_bench(state: nvbench.State) -> None:
-    stride = state.getInt64("Stride")
-    ipt = state.getInt64("ItemsPerThread")
+    stride = state.get_int64("Stride")
+    ipt = state.get_int64("ItemsPerThread")
 
     nbytes = 128 * 1024 * 1024
     elements = nbytes // np.dtype(np.int32).itemsize
 
-    alloc_stream = as_cuda_Stream(state.getStream())
+    alloc_stream = as_cuda_Stream(state.get_stream())
     inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
     out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
 
-    state.addElementCount(elements, column_name="Elements")
-    state.collectCUPTIMetrics()
+    state.add_element_count(elements, column_name="Elements")
+    state.collect_cupti_metrics()
 
     threads_per_block = 256
     blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
@@ -66,7 +66,7 @@ def throughput_bench(state: nvbench.State) -> None:
     )
 
     def launcher(launch: nvbench.Launch):
-        exec_stream = as_cuda_Stream(launch.getStream())
+        exec_stream = as_cuda_Stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](
             stride, elements, inp_arr, out_arr
         )
@@ -76,7 +76,7 @@ def launcher(launch: nvbench.Launch):
 
 if __name__ == "__main__":
     b = nvbench.register(throughput_bench)
-    b.addInt64Axis("Stride", [1, 2, 4])
-    b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
+    b.add_int64_axis("Stride", [1, 2, 4])
+    b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/axes.py b/python/examples/axes.py
@@ -43,26 +43,27 @@ def make_sleep_kernel():
 
 
 def simple(state: nvbench.State):
-    state.setMinSamples(1000)
+    state.set_min_samples(1000)
     sleep_dur = 1e-3
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
 
 
 def single_float64_axis(state: nvbench.State):
     # get axis value, or default
-    sleep_dur = state.getFloat64("Duration", 3.14e-4)
+    default_sleep_dur = 3.14e-4
+    sleep_dur = state.get_float64("Duration", default_sleep_dur)
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
@@ -104,40 +105,40 @@ def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = No
 
 
 def copy_sweep_grid_shape(state: nvbench.State):
-    block_size = state.getInt64("BlockSize")
-    num_blocks = state.getInt64("NumBlocks")
+    block_size = state.get_int64("BlockSize")
+    num_blocks = state.get_int64("NumBlocks")
 
     # Number of int32 elements in 256MiB
     nbytes = 256 * 1024 * 1024
     num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
 
-    state.addElementCount(num_values)
-    state.addGlobalMemoryReads(nbytes)
-    state.addGlobalMemoryWrites(nbytes)
+    state.add_element_count(num_values)
+    state.add_global_memory_reads(nbytes)
+    state.add_global_memory_writes(nbytes)
 
-    dev_id = state.getDevice()
-    alloc_s = as_core_Stream(state.getStream())
+    dev_id = state.get_device()
+    alloc_s = as_core_Stream(state.get_stream())
     input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
     output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
 
     krn = make_copy_kernel()
     launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
     state.exec(launcher)
 
 
 def copy_type_sweep(state: nvbench.State):
-    type_id = state.getInt64("TypeID")
+    type_id = state.get_int64("TypeID")
 
     types_map = {
-        0: (ctypes.c_uint8, "::cuda::std::uint8_t"),
-        1: (ctypes.c_uint16, "::cuda::std::uint16_t"),
-        2: (ctypes.c_uint32, "::cuda::std::uint32_t"),
-        3: (ctypes.c_uint64, "::cuda::std::uint64_t"),
+        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
+        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
+        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
+        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
         4: (ctypes.c_float, "float"),
         5: (ctypes.c_double, "double"),
     }
@@ -149,20 +150,20 @@ def copy_type_sweep(state: nvbench.State):
     nbytes = 256 * 1024 * 1024
     num_values = nbytes // ctypes.sizeof(value_ctype(0))
 
-    state.addElementCount(num_values)
-    state.addGlobalMemoryReads(nbytes)
-    state.addGlobalMemoryWrites(nbytes)
+    state.add_element_count(num_values)
+    state.add_global_memory_reads(nbytes)
+    state.add_global_memory_writes(nbytes)
 
-    dev_id = state.getDevice()
-    alloc_s = as_core_Stream(state.getStream())
+    dev_id = state.get_device()
+    alloc_s = as_core_Stream(state.get_stream())
     input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
     output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
 
     krn = make_copy_kernel(value_cuda_t, value_cuda_t)
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
     state.exec(launcher)
@@ -175,13 +176,15 @@ def launcher(launch: nvbench.Launch):
     # benchmark with no axes, that uses default value
     nvbench.register(default_value)
     # specify axis
-    nvbench.register(single_float64_axis).addFloat64Axis("Duration", [7e-5, 1e-4, 5e-4])
+    nvbench.register(single_float64_axis).add_float64_axis(
+        "Duration", [7e-5, 1e-4, 5e-4]
+    )
 
     copy1_bench = nvbench.register(copy_sweep_grid_shape)
-    copy1_bench.addInt64Axis("BlockSize", [2**x for x in range(6, 10, 2)])
-    copy1_bench.addInt64Axis("NumBlocks", [2**x for x in range(6, 10, 2)])
+    copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
+    copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
 
     copy2_bench = nvbench.register(copy_type_sweep)
-    copy2_bench.addInt64Axis("TypeID", range(0, 6))
+    copy2_bench.add_int64_axis("TypeID", range(0, 6))
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py
@@ -35,15 +35,15 @@ def as_cp_ExternalStream(
 
 def segmented_reduce(state: nvbench.State):
     "Benchmark segmented_reduce example"
-    n_elems = state.getInt64("numElems")
-    n_cols = state.getInt64("numCols")
+    n_elems = state.get_int64("numElems")
+    n_cols = state.get_int64("numCols")
     n_rows = n_elems // n_cols
 
     state.add_summary("numRows", n_rows)
-    state.collectCUPTIMetrics()
+    state.collect_cupti_metrics()
 
-    dev_id = state.getDevice()
-    cp_stream = as_cp_ExternalStream(state.getStream(), dev_id)
+    dev_id = state.get_device()
+    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)
 
     with cp_stream:
         rng = cp.random.default_rng()
@@ -75,20 +75,19 @@ def scale(row_id):
         d_input, d_output, start_offsets, end_offsets, add_op, h_init
     )
 
-    # print(1)
-    cccl_stream = as_cccl_Stream(state.getStream())
-    # print(2, core_stream, core_stream.__cuda_stream__())
+    cccl_stream = as_cccl_Stream(state.get_stream())
+
     # query size of temporary storage and allocate
     temp_nbytes = alg(
         None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
     )
     h_init = np.zeros(tuple(), dtype=np.int32)
-    # print(3)
+
     with cp_stream:
         temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
     def launcher(launch: nvbench.Launch):
-        s = as_cccl_Stream(launch.getStream())
+        s = as_cccl_Stream(launch.get_stream())
         alg(
             temp_storage,
             d_input,
@@ -105,7 +104,7 @@ def launcher(launch: nvbench.Launch):
 
 if __name__ == "__main__":
     b = nvbench.register(segmented_reduce)
-    b.addInt64Axis("numElems", [2**20, 2**22, 2**24])
-    b.addInt64Axis("numCols", [1024, 2048, 4096, 8192])
+    b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
+    b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/cpu_only.py b/python/examples/cpu_only.py
@@ -13,6 +13,6 @@ def launcher(launch: nvbench.Launch):
 
 if __name__ == "__main__":
     b = nvbench.register(throughput_bench)
-    b.setIsCPUOnly(True)
+    b.set_is_cpu_only(True)
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py
@@ -12,34 +12,34 @@ def as_cp_ExternalStream(
 
 
 def cupy_extract_by_mask(state: nvbench.State):
-    n_cols = state.getInt64("numCols")
-    n_rows = state.getInt64("numRows")
+    n_cols = state.get_int64("numCols")
+    n_rows = state.get_int64("numRows")
 
-    dev_id = state.getDevice()
-    cp_s = as_cp_ExternalStream(state.getStream(), dev_id)
+    dev_id = state.get_device()
+    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
 
-    state.collectCUPTIMetrics()
-    state.addElementCount(n_rows * n_cols, "# Elements")
-    state.addGlobalMemoryReads(
+    state.collect_cupti_metrics()
+    state.add_element_count(n_rows * n_cols, "# Elements")
+    state.add_global_memory_reads(
         n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize)
     )
-    state.addGlobalMemoryWrites(n_rows * n_cols * (cp.dtype(cp.int32).itemsize))
+    state.add_global_memory_writes(n_rows * n_cols * (cp.dtype(cp.int32).itemsize))
 
     with cp_s:
         X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32)
         mask = cp.ones((n_cols, n_rows), dtype="?")
         _ = X[mask]
 
     def launcher(launch: nvbench.Launch):
-        with as_cp_ExternalStream(launch.getStream(), dev_id):
+        with as_cp_ExternalStream(launch.get_stream(), dev_id):
             _ = X[mask]
 
     state.exec(launcher, sync=True)
 
 
 if __name__ == "__main__":
     b = nvbench.register(cupy_extract_by_mask)
-    b.addInt64Axis("numCols", [1024, 2048, 4096, 2 * 4096])
-    b.addInt64Axis("numRows", [1024, 2048, 4096, 2 * 4096])
+    b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
+    b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py
@@ -45,17 +45,17 @@ def synchronizing_bench(state: nvbench.State):
     n_values = 64 * 1024 * 1024
     n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))
 
-    alloc_s = as_core_Stream(state.getStream())
-    buffer = core.DeviceMemoryResource(state.getDevice()).allocate(n_bytes, alloc_s)
+    alloc_s = as_core_Stream(state.get_stream())
+    buffer = core.DeviceMemoryResource(state.get_device()).allocate(n_bytes, alloc_s)
 
-    state.addElementCount(n_values, "Items")
-    state.addGlobalMemoryWrites(n_bytes, "Size")
+    state.add_element_count(n_values, "Items")
+    state.add_global_memory_writes(n_bytes, "Size")
 
     krn = make_fill_kernel()
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, buffer, 0, n_values)
         s.sync()
 
diff --git a/python/examples/skip.py b/python/examples/skip.py
@@ -42,8 +42,8 @@ def make_sleep_kernel():
 
 
 def runtime_skip(state: nvbench.State):
-    duration = state.getFloat64("Duration")
-    kramble = state.getString("Kramble")
+    duration = state.get_float64("Duration")
+    kramble = state.get_string("Kramble")
 
     # Skip Baz benchmarks with 0.8 ms duration
     if kramble == "Baz" and duration < 0.8e-3:
@@ -59,15 +59,15 @@ def runtime_skip(state: nvbench.State):
     launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_cfg, krn, duration)
 
     state.exec(launcher)
 
 
 if __name__ == "__main__":
     b = nvbench.register(runtime_skip)
-    b.addFloat64Axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
-    b.addStringAxis("Kramble", ["Foo", "Bar", "Baz"])
+    b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
+    b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/throughput.py b/python/examples/throughput.py
@@ -41,33 +41,27 @@ def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
 
 
 def throughput_bench(state: nvbench.State) -> None:
-    stride = state.getInt64("Stride")
-    ipt = state.getInt64("ItemsPerThread")
+    stride = state.get_int64("Stride")
+    ipt = state.get_int64("ItemsPerThread")
 
     nbytes = 128 * 1024 * 1024
     elements = nbytes // np.dtype(np.int32).itemsize
 
-    alloc_stream = as_cuda_Stream(state.getStream())
+    alloc_stream = as_cuda_Stream(state.get_stream())
     inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
     out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
 
-    state.addElementCount(elements, column_name="Elements")
-    state.addGlobalMemoryReads(inp_arr.nbytes, column_name="Datasize")
-    state.addGlobalMemoryWrites(inp_arr.nbytes)
+    state.add_element_count(elements, column_name="Elements")
+    state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize")
+    state.add_global_memory_writes(inp_arr.nbytes)
 
     threads_per_block = 256
     blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
 
     krn = make_kernel(ipt)
 
-    # warm-up call ensures that kernel is loaded into context
-    # before blocking kernel is launched
-    krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
-        stride, elements, inp_arr, out_arr
-    )
-
     def launcher(launch: nvbench.Launch):
-        exec_stream = as_cuda_Stream(launch.getStream())
+        exec_stream = as_cuda_Stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](
             stride, elements, inp_arr, out_arr
         )
@@ -77,7 +71,7 @@ def launcher(launch: nvbench.Launch):
 
 if __name__ == "__main__":
     b = nvbench.register(throughput_bench)
-    b.addInt64Axis("Stride", [1, 2, 4])
-    b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
+    b.add_int64_axis("Stride", [1, 2, 4])
+    b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
 
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/test/run_1.py b/python/test/run_1.py