Skip to content

Commit 3f1feb2

Browse files
Add Python example based on C++ example/auto_throughput.cpp
1 parent f6aeb02 commit 3f1feb2

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

python/examples/auto_throughput.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright 2025 NVIDIA Corporation
2+
#
3+
# Licensed under the Apache License, Version 2.0 with the LLVM exception
4+
# (the "License"); you may not use this file except in compliance with
5+
# the License.
6+
#
7+
# You may obtain a copy of the License at
8+
#
9+
# http://llvm.org/foundation/relicensing/LICENSE.txt
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import sys
18+
19+
import cuda.nvbench as nvbench
20+
import numpy as np
21+
from numba import cuda
22+
23+
24+
def make_kernel(items_per_thread: int):
    """Build a numba CUDA copy kernel specialized for *items_per_thread*.

    The factory closes over ``items_per_thread`` so the JIT compiler treats it
    as a compile-time constant inside the device loops.

    Returns the compiled device kernel; launch it as
    ``kernel[grid, block, stream, shmem](stride, elements, in_arr, out_arr)``.
    """

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        gid = cuda.grid(1)  # global 1-D thread index
        nthreads = cuda.gridDim.x * cuda.blockDim.x

        # Hoist the loop bounds: each thread walks the index space in steps
        # of (stride * total threads), starting at its own strided offset.
        start = stride * gid
        stop = stride * elements
        hop = stride * nthreads

        for offset in range(start, stop, hop):
            for item in range(items_per_thread):
                src = (items_per_thread * offset + item) % elements
                dst = gid + item * elements
                out_arr[dst] = in_arr[src]

    return kernel
36+
37+
38+
def throughput_bench(state: nvbench.State):
    """Measure strided device-to-device copy throughput for one axis config.

    Reads the "Stride" and "ItemsPerThread" axis values from *state*,
    allocates the device buffers, and hands nvbench a launcher that runs the
    specialized copy kernel on the per-measurement stream.
    """
    stride_value = state.getInt64("Stride")
    items_per_thread = state.getInt64("ItemsPerThread")

    # 128 MiB of int32 input elements.
    element_count = (128 * 1024 * 1024) // np.dtype(np.int32).itemsize

    # Allocate on the stream nvbench associates with this benchmark state.
    setup_stream = cuda.external_stream(state.getStream().addressof())
    src_buf = cuda.device_array(element_count, dtype=np.int32, stream=setup_stream)
    dst_buf = cuda.device_array(
        element_count * items_per_thread, dtype=np.int32, stream=setup_stream
    )

    state.addElementCount(element_count, "Elements")
    state.collectCUPTIMetrics()

    block_dim = 256
    grid_dim = -(-element_count // block_dim)  # ceiling division

    compiled = make_kernel(items_per_thread)

    def launcher(launch: nvbench.Launch):
        # Each measurement runs on the stream nvbench supplies via *launch*.
        run_stream = cuda.external_stream(launch.getStream().addressof())
        compiled[grid_dim, block_dim, run_stream, 0](
            stride_value, element_count, src_buf, dst_buf
        )

    state.exec(launcher)
64+
65+
66+
# Register the benchmark and attach its parameter-sweep axes. Each call is
# made on the return value of the previous one, exactly as the fluent chain
# `register(...).addInt64Axis(...).addInt64Axis(...)` would.
_bench = nvbench.register(throughput_bench)
_bench = _bench.addInt64Axis("Stride", [1, 4])
_bench.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
71+
72+
73+
def main(argv) -> None:
    """Print the nvbench binding version, then run the registered benchmarks."""
    print(nvbench.__version__)
    nvbench.run_all_benchmarks(argv)


if __name__ == "__main__":
    main(sys.argv)

0 commit comments

Comments
 (0)