Skip to content

Commit bdf9fda

Browse files
Add examples/exec_tag_sync.py
1 parent d255388 commit bdf9fda

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

python/examples/exec_tag_sync.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import ctypes
2+
import sys
3+
from typing import Optional
4+
5+
import cuda.cccl.headers as headers
6+
import cuda.core.experimental as core
7+
import cuda.nvbench as nvbench
8+
9+
10+
def make_fill_kernel(data_type: Optional[str] = None):
    """Compile the templated CUDA ``fill_kernel`` and return one instantiation.

    Parameters
    ----------
    data_type:
        C++ type name used to instantiate the kernel template. When
        ``None``, falls back to ``::cuda::std::int32_t``.

    Returns
    -------
    The compiled kernel object for ``fill_kernel<data_type>``.
    """
    # Resolve the template argument up front so the name expression and
    # the later kernel lookup agree on a single spelling.
    if data_type is None:
        data_type = "::cuda::std::int32_t"
    instance_name = f"fill_kernel<{data_type}>"

    kernel_source = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive setting of values in buffer
 */
template <typename T>
__global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;

  for (auto i = init; i < n; i += step)
  {
    buf[i] = v;
  }
}
"""

    # libcu++ headers are needed for <cuda/std/...> includes above.
    include_paths = headers.get_include_paths()
    compile_options = core.ProgramOptions(include_path=str(include_paths.libcudacxx))
    program = core.Program(kernel_source, code_type="c++", options=compile_options)

    # name_expressions forces the template instantiation into the cubin
    # so it can be retrieved by its mangled-name expression afterwards.
    module = program.compile("cubin", name_expressions=(instance_name,))
    return module.get_kernel(instance_name)
37+
38+
39+
def synchronizing_bench(state: nvbench.State):
    """nvbench benchmark whose launcher synchronizes on its stream.

    Demonstrates ``state.exec(..., sync=True)``: the launcher calls
    ``sync()`` itself, so nvbench must be told not to expect purely
    asynchronous launches.
    """
    element_count = 64 * 1024 * 1024
    # Buffer holds one 32-bit integer per element.
    buffer_size = element_count * ctypes.sizeof(ctypes.c_int32(0))

    device = core.Device(state.getDevice())
    device.set_current()

    # Allocate device memory on the stream nvbench associates with this state.
    alloc_stream = device.create_stream(state.getStream())
    buffer = core.DeviceMemoryResource(device).allocate(buffer_size, alloc_stream)

    # Report throughput in items and bytes written per iteration.
    state.addElementCount(element_count, "Items")
    state.addGlobalMemoryWrites(buffer_size, "Size")

    fill = make_fill_kernel()
    config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: nvbench.Launch):
        dev = core.Device()
        dev.set_current()

        stream = dev.create_stream(launch.getStream())
        core.launch(stream, config, fill, buffer, 0, element_count)
        stream.sync()

    # since launcher contains synchronization point,
    # setting sync=True is required to avoid a deadlock
    state.exec(launcher, sync=True)
66+
67+
68+
if __name__ == "__main__":
    # Register the benchmark with nvbench, then hand command-line
    # arguments to its driver, which runs every registered benchmark.
    nvbench.register(synchronizing_bench)
    nvbench.run_all_benchmarks(sys.argv)

0 commit comments

Comments
 (0)