Skip to content

Commit 88a3ad0

Browse files
Add test/stub.py
The following static analysis command should pass (run green): ``` mypy --ignore-missing-imports test/stub.py ```
1 parent 9c01f22 commit 88a3ad0

File tree

1 file changed

+236
-0
lines changed

1 file changed

+236
-0
lines changed

python/test/stubs.py

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
# Copyright 2025 NVIDIA Corporation
2+
#
3+
# Licensed under the Apache License, Version 2.0 with the LLVM exception
4+
# (the "License"); you may not use this file except in compliance with
5+
# the License.
6+
#
7+
# You may obtain a copy of the License at
8+
#
9+
# http://llvm.org/foundation/relicensing/LICENSE.txt
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import ctypes
18+
import sys
19+
from typing import Dict, Optional, Tuple
20+
21+
import cuda.cccl.headers as headers
22+
import cuda.core.experimental as core
23+
import cuda.nvbench as nvbench
24+
25+
26+
def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
    """Adapt an nvbench CUDA stream into a cuda.core Stream via its raw handle."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
28+
29+
30+
def make_sleep_kernel():
    """JITs sleep_kernel(seconds).

    Compiles a CUDA C++ kernel in which every launched thread busy-waits for
    `seconds` (measured with cuda::std::chrono), and returns the kernel handle.
    """
    # CUDA C++ source; compiled at runtime with NVRTC via cuda.core.
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/chrono>

// Each launched thread just sleeps for `seconds`.
__global__ void sleep_kernel(double seconds) {
  namespace chrono = ::cuda::std::chrono;
  using hr_clock = chrono::high_resolution_clock;

  auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
  const auto ns = chrono::nanoseconds(duration);

  const auto start = hr_clock::now();
  const auto finish = start + ns;

  auto now = hr_clock::now();
  while (now < finish)
  {
    now = hr_clock::now();
  }
}
"""
    # libcu++ headers are needed for <cuda/std/...> includes.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    # Compile to cubin and look the kernel up by its (unmangled) name expression.
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
59+
60+
61+
def no_axes(state: nvbench.State):
    """Axis-free benchmark: repeatedly times a fixed 1 ms sleep kernel."""
    seconds = 1e-3
    kernel = make_sleep_kernel()
    config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
    state.set_min_samples(1000)

    print(f"Stopping criterion used: {state.get_stopping_criterion()}")

    def run(launch: nvbench.Launch):
        # Launch on the stream nvbench hands us for this measurement.
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, config, kernel, seconds)

    state.exec(run)
74+
75+
76+
def tags(state: nvbench.State):
    """Sleep-kernel benchmark whose exec() flags come from 'Sync'/'Batched' axes."""
    state.set_min_samples(1000)
    seconds = 1e-3
    kernel = make_sleep_kernel()
    config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    # Integer axis values (0/1) become the boolean exec() options.
    do_sync = state.get_int64("Sync") != 0
    do_batch = state.get_int64("Batched") != 0

    def run(launch: nvbench.Launch):
        core.launch(as_core_Stream(launch.get_stream()), config, kernel, seconds)

    state.exec(run, sync=do_sync, batched=do_batch)
90+
91+
92+
def single_float64_axis(state: nvbench.State):
    """Times the sleep kernel for the 'Duration' float64 axis value (or a default)."""
    # Fall back to this duration when the benchmark was registered without the axis.
    fallback = 3.14e-4
    seconds = state.get_float64_or_default("Duration", fallback)
    kernel = make_sleep_kernel()
    config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def run(launch: nvbench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, config, kernel, seconds)

    state.exec(run)
104+
105+
106+
def default_value(state: nvbench.State):
    """Delegate to single_float64_axis; registered without axes so the default duration is exercised."""
    single_float64_axis(state)
108+
109+
110+
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    """JIT-compile copy_kernel<in_type, out_type>.

    Both type names are CUDA C++ spellings; each defaults to
    ``::cuda::std::int32_t`` when not provided. Returns the kernel handle.
    """
    # Resolve defaults up front so the instantiation name is built once.
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"

    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive copy of `n` values from `in` -> `out`.
 */
template <typename T, typename U>
__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;

  for (auto i = init; i < n; i += step)
  {
    out[i] = static_cast<U>(in[i]);
  }
}
"""
    include_paths = headers.get_include_paths()
    program = core.Program(
        src,
        code_type="c++",
        options=core.ProgramOptions(include_path=str(include_paths.libcudacxx)),
    )
    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    module = program.compile("cubin", name_expressions=(instance_name,))
    return module.get_kernel(instance_name)
139+
140+
141+
def copy_sweep_grid_shape(state: nvbench.State):
    """Copy 256 MiB of int32 data while sweeping the 'BlockSize'/'NumBlocks' axes."""
    block_size = state.get_int64("BlockSize")
    num_blocks = state.get_int64("NumBlocks")

    # 256 MiB of int32 elements.
    nbytes = 256 * 1024 * 1024
    elem_size = ctypes.sizeof(ctypes.c_int32(0))
    num_values = nbytes // elem_size

    # Report throughput-relevant quantities to nvbench.
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)

    device = state.get_device()
    stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)
    output_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)

    kernel = make_copy_kernel()
    config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def run(launch: nvbench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, config, kernel, input_buf, output_buf, num_values)

    state.exec(run)
166+
167+
168+
def copy_type_sweep(state: nvbench.State):
    """Copy 256 MiB of data while sweeping the element type via the 'TypeID' axis."""
    type_id = state.get_int64("TypeID")

    # TypeID -> (host ctypes type for sizing, CUDA C++ type spelling).
    registry: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }

    host_ctype, cuda_type_name = registry[type_id]
    state.add_summary("Type", cuda_type_name)

    # 256 MiB total, element count derived from the element size.
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(host_ctype)

    # Report throughput-relevant quantities to nvbench.
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)

    device = state.get_device()
    stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)
    output_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)

    kernel = make_copy_kernel(cuda_type_name, cuda_type_name)
    config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def run(launch: nvbench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, config, kernel, input_buf, output_buf, num_values)

    state.exec(run)
204+
205+
206+
if __name__ == "__main__":
    # Benchmark without axes, using the entropy stopping criterion.
    bench_no_axes = nvbench.register(no_axes)
    bench_no_axes.set_stopping_criterion("entropy")
    bench_no_axes.set_criterion_param_int64("unused_int", 100)

    # Boolean sync/batched exec-flag sweep.
    bench_tags = nvbench.register(tags)
    bench_tags.add_int64_axis("Sync", [0, 1])
    bench_tags.add_int64_axis("Batched", [0, 1])

    # Benchmark with no axes that exercises the default 'Duration' value.
    bench_default = nvbench.register(default_value)
    bench_default.set_min_samples(7)

    # Explicit 'Duration' axis plus run-control tuning.
    bench_duration = nvbench.register(single_float64_axis)
    bench_duration = bench_duration.add_float64_axis("Duration", [7e-5, 1e-4, 5e-4])
    bench_duration.set_timeout(20)
    bench_duration.set_skip_time(1e-5)
    bench_duration.set_throttle_threshold(0.2)
    bench_duration.set_throttle_recovery_delay(0.1)

    # Grid-shape sweep for the int32 copy benchmark.
    bench_grid = nvbench.register(copy_sweep_grid_shape)
    bench_grid.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2))
    bench_grid.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])

    # Element-type sweep for the copy benchmark.
    bench_types = nvbench.register(copy_type_sweep)
    bench_types.add_int64_axis("TypeID", range(0, 6))

    nvbench.run_all_benchmarks(sys.argv)

0 commit comments

Comments
 (0)