Use core.Stream.from_handle to create core.Stream from nvbench.CudaStream

oleksandr-pavlyk · oleksandr-pavlyk · commit b610543e7d3f · 2025-07-02T15:09:12.000-05:00
diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py
@@ -63,13 +63,9 @@ def launcher(launch: nvbench.Launch):
     state.exec(launcher)
 
 
-(
-    nvbench.register(throughput_bench)
-    .addInt64Axis("Stride", [1, 4])
-    .addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
-)
-
-
 if __name__ == "__main__":
-    print(nvbench.__version__)
+    b = nvbench.register(throughput_bench)
+    b.addInt64Axis("Stride", [1, 2, 4])
+    b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
+
     nvbench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/axes.py b/python/examples/axes.py
@@ -7,6 +7,10 @@
 import cuda.nvbench as nvbench
 
 
+def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+    return core.Stream.from_handle(cs.addressof())  # core.Stream over NVBench's handle
+
+
 def make_sleep_kernel():
     """JITs sleep_kernel(seconds)"""
     src = r"""
@@ -45,10 +49,7 @@ def simple(state: nvbench.State):
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        dev = core.Device()
-        dev.set_current()
-        s = dev.create_stream(launch.getStream())
-
+        s = as_core_Stream(launch.getStream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
@@ -61,10 +62,7 @@ def single_float64_axis(state: nvbench.State):
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        dev = core.Device()
-        dev.set_current()
-        s = dev.create_stream(launch.getStream())
-
+        s = as_core_Stream(launch.getStream())
         core.launch(s, launch_config, krn, sleep_dur)
 
     state.exec(launcher)
@@ -117,21 +115,16 @@ def copy_sweep_grid_shape(state: nvbench.State):
     state.addGlobalMemoryReads(nbytes)
     state.addGlobalMemoryWrites(nbytes)
 
-    dev = core.Device(state.getDevice())
-    dev.set_current()
-
-    alloc_stream = dev.create_stream(state.getStream())
-    input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)
-    output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)
+    dev_id = state.getDevice()
+    alloc_s = as_core_Stream(state.getStream())
+    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
+    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
 
     krn = make_copy_kernel()
     launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        dev = core.Device()
-        dev.set_current()
-        s = dev.create_stream(launch.getStream())
-
+        s = as_core_Stream(launch.getStream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
     state.exec(launcher)
@@ -160,21 +153,16 @@ def copy_type_sweep(state: nvbench.State):
     state.addGlobalMemoryReads(nbytes)
     state.addGlobalMemoryWrites(nbytes)
 
-    dev = core.Device(state.getDevice())
-    dev.set_current()
-
-    alloc_stream = dev.create_stream(state.getStream())
-    input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)
-    output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)
+    dev_id = state.getDevice()
+    alloc_s = as_core_Stream(state.getStream())
+    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
+    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
 
     krn = make_copy_kernel(value_cuda_t, value_cuda_t)
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        dev = core.Device()
-        dev.set_current()
-        s = dev.create_stream(launch.getStream())
-
+        s = as_core_Stream(launch.getStream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
 
     state.exec(launcher)
diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py
@@ -7,6 +7,11 @@
 import cuda.nvbench as nvbench
 
 
+def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+    """Create a core.Stream view of the native stream used by NVBench."""
+    return core.Stream.from_handle(cs.addressof())
+
+
 def make_fill_kernel(data_type: Optional[str] = None):
     src = r"""
 #include <cuda/std/cstdint>
@@ -40,11 +45,8 @@ def synchronizing_bench(state: nvbench.State):
     n_values = 64 * 1024 * 1024
     n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))
 
-    dev = core.Device(state.getDevice())
-    dev.set_current()
-
-    alloc_stream = dev.create_stream(state.getStream())
-    buffer = core.DeviceMemoryResource(dev).allocate(n_bytes, alloc_stream)
+    alloc_s = as_core_Stream(state.getStream())
+    buffer = core.DeviceMemoryResource(state.getDevice()).allocate(n_bytes, alloc_s)
 
     state.addElementCount(n_values, "Items")
     state.addGlobalMemoryWrites(n_bytes, "Size")
@@ -53,10 +55,7 @@ def synchronizing_bench(state: nvbench.State):
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        dev = core.Device()
-        dev.set_current()
-
-        s = dev.create_stream(launch.getStream())
+        s = as_core_Stream(launch.getStream())
         core.launch(s, launch_config, krn, buffer, 0, n_values)
         s.sync()
 
diff --git a/python/examples/skip.py b/python/examples/skip.py
@@ -5,6 +5,11 @@
 import cuda.nvbench as nvbench
 
 
+def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+    """Create a core.Stream view into the native stream provided by NVBench."""
+    return core.Stream.from_handle(cs.addressof())
+
+
 def make_sleep_kernel():
     """JITs sleep_kernel(seconds)"""
     src = r"""
@@ -54,10 +59,7 @@ def runtime_skip(state: nvbench.State):
     launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
 
     def launcher(launch: nvbench.Launch):
-        dev = core.Device()
-        dev.set_current()
-
-        s = dev.create_stream(launch.getStream())
+        s = as_core_Stream(launch.getStream())
         core.launch(s, launch_cfg, krn, duration)
 
     state.exec(launcher)