
Commit c32e229

Add examples/axes.py
1 parent d5a6b1a commit c32e229

File tree

1 file changed: +198 -0 lines changed


python/examples/axes.py

@@ -0,0 +1,198 @@
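# Demonstrates parameter axes with the cuda.nvbench Python bindings: a
# benchmark with no axes, a float64 "Duration" axis (attached explicitly or
# left to its default value), and int64 axes that sweep launch geometry and
# element type.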
import ctypes
import sys
from typing import Optional

import cuda.cccl.headers as headers
import cuda.core.experimental as core
import cuda.nvbench as nvbench


def make_sleep_kernel():
    """JITs sleep_kernel(seconds)"""
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/chrono>

// Each launched thread just sleeps for `seconds`.
__global__ void sleep_kernel(double seconds) {
  namespace chrono = ::cuda::std::chrono;
  using hr_clock = chrono::high_resolution_clock;

  auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
  const auto ns = chrono::nanoseconds(duration);

  const auto start = hr_clock::now();
  const auto finish = start + ns;

  auto now = hr_clock::now();
  while (now < finish)
  {
    now = hr_clock::now();
  }
}
"""
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
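
# Each benchmark below follows the same pattern: configure the state, build a
# `launcher(launch)` closure that launches the kernel on the stream nvbench
# times, and hand the closure to `state.exec`.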
def simple(state: nvbench.State):
    state.setMinSamples(1000)
    sleep_dur = 1e-3
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: nvbench.Launch):
        dev = core.Device()
        dev.set_current()
        s = dev.create_stream(launch.getStream())

        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)


def single_float64_axis(state: nvbench.State):
    # get axis value, or default
    sleep_dur = state.getFloat64("Duration", 3.14e-4)
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: nvbench.Launch):
        dev = core.Device()
        dev.set_current()
        s = dev.create_stream(launch.getStream())

        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
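
# `default_value` is registered below without a "Duration" axis, so the
# `state.getFloat64("Duration", 3.14e-4)` call in `single_float64_axis`
# falls back to its default.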
def default_value(state: nvbench.State):
    single_float64_axis(state)


def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive copy of `n` values from `in` -> `out`.
 */
template <typename T, typename U>
__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;

  for (auto i = init; i < n; i += step)
  {
    out[i] = static_cast<U>(in[i]);
  }
}
"""
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"
    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
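
# Note: the `getInt64` calls below pass no default, so this benchmark assumes
# "BlockSize" and "NumBlocks" axes are attached at registration time.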
def copy_sweep_grid_shape(state: nvbench.State):
    block_size = state.getInt64("BlockSize")
    num_blocks = state.getInt64("NumBlocks")

    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))

    state.addElementCount(num_values)
    state.addGlobalMemoryReads(nbytes)
    state.addGlobalMemoryWrites(nbytes)

    dev = core.Device(state.getDevice())
    dev.set_current()

    alloc_stream = dev.create_stream(state.getStream())
    input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)
    output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)

    krn = make_copy_kernel()
    launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def launcher(launch: nvbench.Launch):
        dev = core.Device()
        dev.set_current()
        s = dev.create_stream(launch.getStream())

        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)


def copy_type_sweep(state: nvbench.State):
    type_id = state.getInt64("TypeID")

    types_map = {
        0: (ctypes.c_uint8, "::cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "::cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "::cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "::cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }

    value_ctype, value_cuda_t = types_map[type_id]

    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(value_ctype(0))

    state.addElementCount(num_values)
    state.addGlobalMemoryReads(nbytes)
    state.addGlobalMemoryWrites(nbytes)

    dev = core.Device(state.getDevice())
    dev.set_current()

    alloc_stream = dev.create_stream(state.getStream())
    input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)
    output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream)

    krn = make_copy_kernel(value_cuda_t, value_cuda_t)
    launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: nvbench.Launch):
        dev = core.Device()
        dev.set_current()
        s = dev.create_stream(launch.getStream())

        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
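
# Registration wires each function into nvbench and attaches its axes.
# `run_all_benchmarks(sys.argv)` forwards the command line to nvbench, which
# (as in the C++ runner, and presumably here) allows options such as
# `--axis "Name=Value"` to override the values configured below.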
if __name__ == "__main__":
    # Benchmark without axes
    nvbench.register(simple)

    # Benchmark with no axes; relies on the default "Duration" value
    nvbench.register(default_value)
    # Explicitly specify the axis values
    nvbench.register(single_float64_axis).addFloat64Axis("Duration", [7e-5, 1e-4, 5e-4])

    copy1_bench = nvbench.register(copy_sweep_grid_shape)
    copy1_bench.addInt64Axis("BlockSize", [2**x for x in range(6, 10, 2)])
    copy1_bench.addInt64Axis("NumBlocks", [2**x for x in range(6, 10, 2)])

    copy2_bench = nvbench.register(copy_type_sweep)
    copy2_bench.addInt64Axis("TypeID", range(0, 6))

    nvbench.run_all_benchmarks(sys.argv)
