Add examples/exec_tag_sync.py

oleksandr-pavlyk · oleksandr-pavlyk · commit bdf9fda74bc4 · 2025-07-02T11:49:32.000-05:00
diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py
@@ -0,0 +1,70 @@
+import ctypes
+import sys
+from typing import Optional
+
+import cuda.cccl.headers as headers
+import cuda.core.experimental as core
+import cuda.nvbench as nvbench
+
+
+def make_fill_kernel(data_type: Optional[str] = None):
+    src = r"""
+#include <cuda/std/cstdint>
+#include <cuda/std/cstddef>
+/*!
+ * Naive setting of values in buffer
+ */
+template <typename T>
+__global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
+{
+  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto step = blockDim.x * gridDim.x;
+
+  for (auto i = init; i < n; i += step)
+  {
+    buf[i] = v;
+  }
+}
+"""
+    incl = headers.get_include_paths()
+    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
+    prog = core.Program(src, code_type="c++", options=opts)
+    if data_type is None:
+        data_type = "::cuda::std::int32_t"
+    instance_name = f"fill_kernel<{data_type}>"
+    mod = prog.compile("cubin", name_expressions=(instance_name,))
+    return mod.get_kernel(instance_name)
+
+
+def synchronizing_bench(state: nvbench.State):
+    n_values = 64 * 1024 * 1024
+    n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))
+
+    dev = core.Device(state.getDevice())
+    dev.set_current()
+
+    alloc_stream = dev.create_stream(state.getStream())
+    buffer = core.DeviceMemoryResource(dev).allocate(n_bytes, alloc_stream)
+
+    state.addElementCount(n_values, "Items")
+    state.addGlobalMemoryWrites(n_bytes, "Size")
+
+    krn = make_fill_kernel()
+    launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
+
+    def launcher(launch: nvbench.Launch):
+        dev = core.Device()
+        dev.set_current()
+
+        s = dev.create_stream(launch.getStream())
+        core.launch(s, launch_config, krn, buffer, 0, n_values)
+        s.sync()
+
+    # since launcher contains synchronization point,
+    # setting sync=True is required to avoid a deadlock
+    state.exec(launcher, sync=True)
+
+
+if __name__ == "__main__":
+    nvbench.register(synchronizing_bench)
+    nvbench.run_all_benchmarks(sys.argv)