Skip to content

Commit 36adf3a

Browse files
authored
Merge pull request #204 from alliepiper/summaries
Add min/max timings, new "summaries" example.
2 parents beca2c0 + 2ba8acd commit 36adf3a

File tree

7 files changed

+243
-68
lines changed

7 files changed

+243
-68
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ various NVBench features and usecases:
7373
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
7474
- [Skipping benchmark configurations](examples/skip.cu)
7575
- [Benchmarking on a specific stream](examples/stream.cu)
76+
- [Adding / hiding columns (summaries) in markdown output](examples/summaries.cu)
7677
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
7778
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
7879

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ set(example_srcs
88
exec_tag_timer.cu
99
skip.cu
1010
stream.cu
11+
summaries.cu
1112
throughput.cu
1213
)
1314

examples/summaries.cu

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright 2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#include <nvbench/nvbench.cuh>
20+
21+
// Grab some testing kernels from NVBench:
22+
#include <nvbench/test_kernels.cuh>
23+
24+
// #define PRINT_DEFAULT_SUMMARY_TAGS
25+
26+
// Example benchmark demonstrating custom and default summaries:
//  - reads the "ms" and "us" int64 axes and combines them into one duration in seconds,
//  - publishes that duration as a custom "duration" summary,
//  - launches nvbench::sleep_kernel with that duration on the launch's stream,
//  - then un-hides the min/max cold-time summaries and hides the mean ones.
void summary_example(nvbench::state &state)
27+
{
28+
// Fetch parameters and compute duration in seconds:
29+
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
30+
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
31+
const auto duration = ms * 1e-3 + us * 1e-6;
32+
33+
// Add a new column to the summary table with the derived duration used by the benchmark.
34+
// See the documentation in nvbench/summary.cuh for more details.
35+
{
36+
nvbench::summary &summary = state.add_summary("duration");
37+
summary.set_string("name", "Duration (s)");
38+
summary.set_string("description", "The duration of the kernel execution.");
39+
summary.set_string("hint", "duration");
40+
summary.set_float64("value", duration);
41+
}
42+
43+
// Run the measurements:
44+
state.exec([duration](nvbench::launch &launch) {
45+
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
46+
});
47+
48+
#ifdef PRINT_DEFAULT_SUMMARY_TAGS
49+
// The default summary tags can be found by inspecting the state after calling
50+
// state.exec.
51+
// They can also be found by looking at the json output (--json <filename>)
// NOTE(review): std::cout is used here but <iostream> is not included directly by this
// file -- confirm it is pulled in transitively by the nvbench headers before enabling.
52+
for (const auto &summary : state.get_summaries())
53+
{
54+
std::cout << summary.get_tag() << std::endl;
55+
}
56+
#endif
57+
58+
// Default summary columns can be shown/hidden in the markdown output tables by removing/adding
59+
// the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the
60+
// means.
61+
state.get_summary("nv/cold/time/gpu/min").remove_value("hide");
62+
state.get_summary("nv/cold/time/gpu/max").remove_value("hide");
63+
state.get_summary("nv/cold/time/gpu/mean").set_string("hide", "");
64+
state.get_summary("nv/cold/time/cpu/min").remove_value("hide");
65+
state.get_summary("nv/cold/time/cpu/max").remove_value("hide");
66+
state.get_summary("nv/cold/time/cpu/mean").set_string("hide", "");
67+
}
68+
// Register summary_example with the two int64 axes ("ms" and "us") that it reads
// through state.get_int64().
NVBENCH_BENCH(summary_example)
69+
.add_int64_axis("ms", nvbench::range(10, 50, 20))
70+
.add_int64_axis("us", nvbench::range(100, 500, 200));

nvbench/detail/measure_cold.cu

Lines changed: 106 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
#include <nvbench/state.cuh>
2626
#include <nvbench/summary.cuh>
2727

28+
#include <algorithm>
29+
#include <limits>
30+
2831
#include <fmt/format.h>
2932

3033
namespace nvbench::detail
@@ -64,13 +67,17 @@ void measure_cold_base::check()
6467

6568
void measure_cold_base::initialize()
6669
{
67-
m_total_cuda_time = 0.;
68-
m_total_cpu_time = 0.;
69-
m_cpu_noise = 0.;
70-
m_total_samples = 0;
70+
m_min_cuda_time = std::numeric_limits<nvbench::float64_t>::max();
71+
m_max_cuda_time = std::numeric_limits<nvbench::float64_t>::lowest();
72+
m_total_cuda_time = 0.;
73+
m_min_cpu_time = std::numeric_limits<nvbench::float64_t>::max();
74+
m_max_cpu_time = std::numeric_limits<nvbench::float64_t>::lowest();
75+
m_total_cpu_time = 0.;
76+
m_total_samples = 0;
77+
m_max_time_exceeded = false;
78+
7179
m_cuda_times.clear();
7280
m_cpu_times.clear();
73-
m_max_time_exceeded = false;
7481

7582
m_stopping_criterion.initialize(m_criterion_params);
7683
}
@@ -82,10 +89,17 @@ void measure_cold_base::record_measurements()
8289
// Update and record timers and counters:
8390
const auto cur_cuda_time = m_cuda_timer.get_duration();
8491
const auto cur_cpu_time = m_cpu_timer.get_duration();
85-
m_cuda_times.push_back(cur_cuda_time);
86-
m_cpu_times.push_back(cur_cpu_time);
92+
93+
m_min_cuda_time = std::min(m_min_cuda_time, cur_cuda_time);
94+
m_max_cuda_time = std::max(m_max_cuda_time, cur_cuda_time);
8795
m_total_cuda_time += cur_cuda_time;
96+
m_cuda_times.push_back(cur_cuda_time);
97+
98+
m_min_cpu_time = std::min(m_min_cpu_time, cur_cpu_time);
99+
m_max_cpu_time = std::max(m_max_cpu_time, cur_cpu_time);
88100
m_total_cpu_time += cur_cpu_time;
101+
m_cpu_times.push_back(cur_cpu_time);
102+
89103
++m_total_samples;
90104

91105
m_stopping_criterion.add_measurement(cur_cuda_time);
@@ -118,21 +132,10 @@ bool measure_cold_base::is_finished()
118132
return false;
119133
}
120134

121-
void measure_cold_base::run_trials_epilogue()
122-
{
123-
// Only need to compute this at the end, not per iteration.
124-
const auto cpu_mean = m_total_cpu_time / static_cast<nvbench::float64_t>(m_total_samples);
125-
const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
126-
m_cpu_times.cend(),
127-
cpu_mean);
128-
m_cpu_noise = cpu_stdev / cpu_mean;
129-
130-
m_walltime_timer.stop();
131-
}
135+
void measure_cold_base::run_trials_epilogue() { m_walltime_timer.stop(); }
132136

133137
void measure_cold_base::generate_summaries()
134138
{
135-
const auto d_samples = static_cast<double>(m_total_samples);
136139
{
137140
auto &summ = m_state.add_summary("nv/cold/sample_size");
138141
summ.set_string("name", "Samples");
@@ -141,51 +144,113 @@ void measure_cold_base::generate_summaries()
141144
summ.set_int64("value", m_total_samples);
142145
}
143146

144-
const auto avg_cpu_time = m_total_cpu_time / d_samples;
147+
{
148+
auto &summ = m_state.add_summary("nv/cold/time/cpu/min");
149+
summ.set_string("name", "Min CPU Time");
150+
summ.set_string("hint", "duration");
151+
summ.set_string("description",
152+
"Fastest isolated kernel execution time "
153+
"(measured on host CPU)");
154+
summ.set_float64("value", m_min_cpu_time);
155+
summ.set_string("hide", "Hidden by default.");
156+
}
157+
158+
{
159+
auto &summ = m_state.add_summary("nv/cold/time/cpu/max");
160+
summ.set_string("name", "Max CPU Time");
161+
summ.set_string("hint", "duration");
162+
summ.set_string("description",
163+
"Slowest isolated kernel execution time "
164+
"(measured on host CPU)");
165+
summ.set_float64("value", m_max_cpu_time);
166+
summ.set_string("hide", "Hidden by default.");
167+
}
168+
169+
const auto d_samples = static_cast<double>(m_total_samples);
170+
const auto cpu_mean = m_total_cpu_time / d_samples;
145171
{
146172
auto &summ = m_state.add_summary("nv/cold/time/cpu/mean");
147173
summ.set_string("name", "CPU Time");
148174
summ.set_string("hint", "duration");
149175
summ.set_string("description",
150176
"Mean isolated kernel execution time "
151177
"(measured on host CPU)");
152-
summ.set_float64("value", avg_cpu_time);
178+
summ.set_float64("value", cpu_mean);
179+
}
180+
181+
const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
182+
m_cpu_times.cend(),
183+
cpu_mean);
184+
{
185+
// Absolute standard deviation of the recorded CPU times. The value is cpu_stdev itself
// (not divided by the mean), so it is tagged as a duration rather than a percentage; the
// relative form is published separately under nv/cold/time/cpu/stdev/relative.
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute");
186+
summ.set_string("name", "CPU Stdev");
187+
summ.set_string("hint", "duration");
188+
summ.set_string("description", "Standard deviation of isolated CPU times");
189+
summ.set_float64("value", cpu_stdev);
190+
summ.set_string("hide", "Hidden by default.");
153191
}
154192

193+
const auto cpu_noise = cpu_stdev / cpu_mean;
155194
{
156195
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
157196
summ.set_string("name", "Noise");
158197
summ.set_string("hint", "percentage");
159198
summ.set_string("description", "Relative standard deviation of isolated CPU times");
160-
summ.set_float64("value", m_cpu_noise);
199+
summ.set_float64("value", cpu_noise);
200+
}
201+
202+
{
203+
auto &summ = m_state.add_summary("nv/cold/time/gpu/min");
204+
summ.set_string("name", "Min GPU Time");
205+
summ.set_string("hint", "duration");
206+
summ.set_string("description",
207+
"Fastest isolated kernel execution time "
208+
"(measured with CUDA events)");
209+
summ.set_float64("value", m_min_cuda_time);
210+
summ.set_string("hide", "Hidden by default.");
211+
}
212+
213+
{
214+
auto &summ = m_state.add_summary("nv/cold/time/gpu/max");
215+
summ.set_string("name", "Max GPU Time");
216+
summ.set_string("hint", "duration");
217+
summ.set_string("description",
218+
"Slowest isolated kernel execution time "
219+
"(measured with CUDA events)");
220+
summ.set_float64("value", m_max_cuda_time);
221+
summ.set_string("hide", "Hidden by default.");
161222
}
162223

163-
const auto avg_cuda_time = m_total_cuda_time / d_samples;
224+
const auto cuda_mean = m_total_cuda_time / d_samples;
164225
{
165226
auto &summ = m_state.add_summary("nv/cold/time/gpu/mean");
166227
summ.set_string("name", "GPU Time");
167228
summ.set_string("hint", "duration");
168229
summ.set_string("description",
169230
"Mean isolated kernel execution time "
170231
"(measured with CUDA events)");
171-
summ.set_float64("value", avg_cuda_time);
232+
summ.set_float64("value", cuda_mean);
172233
}
173234

174-
const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
175-
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
235+
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
176236
m_cuda_times.cend(),
177-
mean_cuda_time);
178-
const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
179-
const auto noise = cuda_rel_stdev;
180-
const auto max_noise = m_criterion_params.get_float64("max-noise");
181-
const auto min_time = m_criterion_params.get_float64("min-time");
237+
cuda_mean);
238+
{
239+
// Absolute standard deviation of the recorded GPU times. The value is cuda_stdev itself
// (not divided by the mean), so it is tagged as a duration rather than a percentage; the
// relative form is published separately under nv/cold/time/gpu/stdev/relative.
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute");
240+
summ.set_string("name", "GPU Stdev");
241+
summ.set_string("hint", "duration");
242+
summ.set_string("description", "Standard deviation of isolated GPU times");
243+
summ.set_float64("value", cuda_stdev);
244+
summ.set_string("hide", "Hidden by default.");
245+
}
182246

247+
const auto cuda_noise = cuda_stdev / cuda_mean;
183248
{
184249
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
185250
summ.set_string("name", "Noise");
186251
summ.set_string("hint", "percentage");
187252
summ.set_string("description", "Relative standard deviation of isolated GPU times");
188-
summ.set_float64("value", noise);
253+
summ.set_float64("value", cuda_noise);
189254
}
190255

191256
if (const auto items = m_state.get_element_count(); items != 0)
@@ -194,12 +259,12 @@ void measure_cold_base::generate_summaries()
194259
summ.set_string("name", "Elem/s");
195260
summ.set_string("hint", "item_rate");
196261
summ.set_string("description", "Number of input elements processed per second");
197-
summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
262+
summ.set_float64("value", static_cast<double>(items) / cuda_mean);
198263
}
199264

200265
if (const auto bytes = m_state.get_global_memory_rw_bytes(); bytes != 0)
201266
{
202-
const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
267+
const auto avg_used_gmem_bw = static_cast<double>(bytes) / cuda_mean;
203268
{
204269
auto &summ = m_state.add_summary("nv/cold/bw/global/bytes_per_second");
205270
summ.set_string("name", "GlobalMem BW");
@@ -240,16 +305,18 @@ void measure_cold_base::generate_summaries()
240305

241306
if (m_max_time_exceeded)
242307
{
243-
const auto timeout = m_walltime_timer.get_duration();
308+
const auto timeout = m_walltime_timer.get_duration();
309+
const auto max_noise = m_criterion_params.get_float64("max-noise");
310+
const auto min_time = m_criterion_params.get_float64("min-time");
244311

245-
if (noise > max_noise)
312+
if (cuda_noise > max_noise)
246313
{
247314
printer.log(nvbench::log_level::warn,
248315
fmt::format("Current measurement timed out ({:0.2f}s) "
249316
"while over noise threshold ({:0.2f}% > "
250317
"{:0.2f}%)",
251318
timeout,
252-
noise * 100,
319+
cuda_noise * 100,
253320
max_noise * 100));
254321
}
255322
if (m_total_samples < m_min_samples)
@@ -277,8 +344,8 @@ void measure_cold_base::generate_summaries()
277344
printer.log(nvbench::log_level::pass,
278345
fmt::format("Cold: {:0.6f}ms GPU, {:0.6f}ms CPU, {:0.2f}s "
279346
"total GPU, {:0.2f}s total wall, {}x ",
280-
avg_cuda_time * 1e3,
281-
avg_cpu_time * 1e3,
347+
cuda_mean * 1e3,
348+
cpu_mean * 1e3,
282349
m_total_cuda_time,
283350
m_walltime_timer.get_duration(),
284351
m_total_samples));

nvbench/detail/measure_cold.cuh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,14 @@ protected:
9898
nvbench::float64_t m_timeout{};
9999

100100
nvbench::int64_t m_total_samples{};
101+
102+
nvbench::float64_t m_min_cuda_time{};
103+
nvbench::float64_t m_max_cuda_time{};
101104
nvbench::float64_t m_total_cuda_time{};
105+
106+
nvbench::float64_t m_min_cpu_time{};
107+
nvbench::float64_t m_max_cpu_time{};
102108
nvbench::float64_t m_total_cpu_time{};
103-
nvbench::float64_t m_cpu_noise{}; // rel stdev
104109

105110
std::vector<nvbench::float64_t> m_cuda_times;
106111
std::vector<nvbench::float64_t> m_cpu_times;

nvbench/detail/measure_cpu_only.cuh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,10 @@ protected:
7676
nvbench::float64_t m_timeout{};
7777

7878
nvbench::int64_t m_total_samples{};
79+
80+
nvbench::float64_t m_min_cpu_time{};
81+
nvbench::float64_t m_max_cpu_time{};
7982
nvbench::float64_t m_total_cpu_time{};
80-
nvbench::float64_t m_cpu_noise{}; // rel stdev
8183

8284
std::vector<nvbench::float64_t> m_cpu_times;
8385

0 commit comments

Comments
 (0)