2525#include < nvbench/state.cuh>
2626#include < nvbench/summary.cuh>
2727
28+ #include < algorithm>
29+ #include < limits>
30+
2831#include < fmt/format.h>
2932
3033namespace nvbench ::detail
@@ -64,13 +67,17 @@ void measure_cold_base::check()
6467
6568void measure_cold_base::initialize ()
6669{
67- m_total_cuda_time = 0 .;
68- m_total_cpu_time = 0 .;
69- m_cpu_noise = 0 .;
70- m_total_samples = 0 ;
70+ m_min_cuda_time = std::numeric_limits<nvbench::float64_t >::max ();
71+ m_max_cuda_time = std::numeric_limits<nvbench::float64_t >::lowest ();
72+ m_total_cuda_time = 0 .;
73+ m_min_cpu_time = std::numeric_limits<nvbench::float64_t >::max ();
74+ m_max_cpu_time = std::numeric_limits<nvbench::float64_t >::lowest ();
75+ m_total_cpu_time = 0 .;
76+ m_total_samples = 0 ;
77+ m_max_time_exceeded = false ;
78+
7179 m_cuda_times.clear ();
7280 m_cpu_times.clear ();
73- m_max_time_exceeded = false ;
7481
7582 m_stopping_criterion.initialize (m_criterion_params);
7683}
@@ -82,10 +89,17 @@ void measure_cold_base::record_measurements()
8289 // Update and record timers and counters:
8390 const auto cur_cuda_time = m_cuda_timer.get_duration ();
8491 const auto cur_cpu_time = m_cpu_timer.get_duration ();
85- m_cuda_times.push_back (cur_cuda_time);
86- m_cpu_times.push_back (cur_cpu_time);
92+
93+ m_min_cuda_time = std::min (m_min_cuda_time, cur_cuda_time);
94+ m_max_cuda_time = std::max (m_max_cuda_time, cur_cuda_time);
8795 m_total_cuda_time += cur_cuda_time;
96+ m_cuda_times.push_back (cur_cuda_time);
97+
98+ m_min_cpu_time = std::min (m_min_cpu_time, cur_cpu_time);
99+ m_max_cpu_time = std::max (m_max_cpu_time, cur_cpu_time);
88100 m_total_cpu_time += cur_cpu_time;
101+ m_cpu_times.push_back (cur_cpu_time);
102+
89103 ++m_total_samples;
90104
91105 m_stopping_criterion.add_measurement (cur_cuda_time);
@@ -118,21 +132,10 @@ bool measure_cold_base::is_finished()
118132 return false ;
119133}
120134
121- void measure_cold_base::run_trials_epilogue ()
122- {
123- // Only need to compute this at the end, not per iteration.
124- const auto cpu_mean = m_total_cpu_time / static_cast <nvbench::float64_t >(m_total_samples);
125- const auto cpu_stdev = nvbench::detail::statistics::standard_deviation (m_cpu_times.cbegin (),
126- m_cpu_times.cend (),
127- cpu_mean);
128- m_cpu_noise = cpu_stdev / cpu_mean;
129-
130- m_walltime_timer.stop ();
131- }
135+ void measure_cold_base::run_trials_epilogue () { m_walltime_timer.stop (); }
132136
133137void measure_cold_base::generate_summaries ()
134138{
135- const auto d_samples = static_cast <double >(m_total_samples);
136139 {
137140 auto &summ = m_state.add_summary (" nv/cold/sample_size" );
138141 summ.set_string (" name" , " Samples" );
@@ -141,51 +144,113 @@ void measure_cold_base::generate_summaries()
141144 summ.set_int64 (" value" , m_total_samples);
142145 }
143146
144- const auto avg_cpu_time = m_total_cpu_time / d_samples;
147+ {
148+ auto &summ = m_state.add_summary (" nv/cold/time/cpu/min" );
149+ summ.set_string (" name" , " Min CPU Time" );
150+ summ.set_string (" hint" , " duration" );
151+ summ.set_string (" description" ,
152+ " Fastest isolated kernel execution time "
153+ " (measured on host CPU)" );
154+ summ.set_float64 (" value" , m_min_cpu_time);
155+ summ.set_string (" hide" , " Hidden by default." );
156+ }
157+
158+ {
159+ auto &summ = m_state.add_summary (" nv/cold/time/cpu/max" );
160+ summ.set_string (" name" , " Max CPU Time" );
161+ summ.set_string (" hint" , " duration" );
162+ summ.set_string (" description" ,
163+ " Slowest isolated kernel execution time "
164+ " (measured on host CPU)" );
165+ summ.set_float64 (" value" , m_max_cpu_time);
166+ summ.set_string (" hide" , " Hidden by default." );
167+ }
168+
169+ const auto d_samples = static_cast <double >(m_total_samples);
170+ const auto cpu_mean = m_total_cpu_time / d_samples;
145171 {
146172 auto &summ = m_state.add_summary (" nv/cold/time/cpu/mean" );
147173 summ.set_string (" name" , " CPU Time" );
148174 summ.set_string (" hint" , " duration" );
149175 summ.set_string (" description" ,
150176 " Mean isolated kernel execution time "
151177 " (measured on host CPU)" );
152- summ.set_float64 (" value" , avg_cpu_time);
178+ summ.set_float64 (" value" , cpu_mean);
179+ }
180+
181+ const auto cpu_stdev = nvbench::detail::statistics::standard_deviation (m_cpu_times.cbegin (),
182+ m_cpu_times.cend (),
183+ cpu_mean);
184+ {
185+ auto &summ = m_state.add_summary (" nv/cold/time/cpu/stdev/absolute" );
186+ summ.set_string (" name" , " Noise" );
187+ summ.set_string (" hint" , " percentage" );
188+ summ.set_string (" description" , " Relative standard deviation of isolated CPU times" );
189+ summ.set_float64 (" value" , cpu_stdev);
190+ summ.set_string (" hide" , " Hidden by default." );
153191 }
154192
193+ const auto cpu_noise = cpu_stdev / cpu_mean;
155194 {
156195 auto &summ = m_state.add_summary (" nv/cold/time/cpu/stdev/relative" );
157196 summ.set_string (" name" , " Noise" );
158197 summ.set_string (" hint" , " percentage" );
159198 summ.set_string (" description" , " Relative standard deviation of isolated CPU times" );
160- summ.set_float64 (" value" , m_cpu_noise);
199+ summ.set_float64 (" value" , cpu_noise);
200+ }
201+
202+ {
203+ auto &summ = m_state.add_summary (" nv/cold/time/gpu/min" );
204+ summ.set_string (" name" , " Min GPU Time" );
205+ summ.set_string (" hint" , " duration" );
206+ summ.set_string (" description" ,
207+ " Fastest isolated kernel execution time "
208+ " (measured with CUDA events)" );
209+ summ.set_float64 (" value" , m_min_cuda_time);
210+ summ.set_string (" hide" , " Hidden by default." );
211+ }
212+
213+ {
214+ auto &summ = m_state.add_summary (" nv/cold/time/gpu/max" );
215+ summ.set_string (" name" , " Max GPU Time" );
216+ summ.set_string (" hint" , " duration" );
217+ summ.set_string (" description" ,
218+ " Slowest isolated kernel execution time "
219+ " (measured with CUDA events)" );
220+ summ.set_float64 (" value" , m_max_cuda_time);
221+ summ.set_string (" hide" , " Hidden by default." );
161222 }
162223
163- const auto avg_cuda_time = m_total_cuda_time / d_samples;
224+ const auto cuda_mean = m_total_cuda_time / d_samples;
164225 {
165226 auto &summ = m_state.add_summary (" nv/cold/time/gpu/mean" );
166227 summ.set_string (" name" , " GPU Time" );
167228 summ.set_string (" hint" , " duration" );
168229 summ.set_string (" description" ,
169230 " Mean isolated kernel execution time "
170231 " (measured with CUDA events)" );
171- summ.set_float64 (" value" , avg_cuda_time );
232+ summ.set_float64 (" value" , cuda_mean );
172233 }
173234
174- const auto mean_cuda_time = m_total_cuda_time / static_cast <nvbench::float64_t >(m_total_samples);
175- const auto cuda_stdev = nvbench::detail::statistics::standard_deviation (m_cuda_times.cbegin (),
235+ const auto cuda_stdev = nvbench::detail::statistics::standard_deviation (m_cuda_times.cbegin (),
176236 m_cuda_times.cend (),
177- mean_cuda_time);
178- const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
179- const auto noise = cuda_rel_stdev;
180- const auto max_noise = m_criterion_params.get_float64 (" max-noise" );
181- const auto min_time = m_criterion_params.get_float64 (" min-time" );
237+ cuda_mean);
238+ {
239+ auto &summ = m_state.add_summary (" nv/cold/time/gpu/stdev/absolute" );
240+ summ.set_string (" name" , " Noise" );
241+ summ.set_string (" hint" , " percentage" );
242+ summ.set_string (" description" , " Relative standard deviation of isolated GPU times" );
243+ summ.set_float64 (" value" , cuda_stdev);
244+ summ.set_string (" hide" , " Hidden by default." );
245+ }
182246
247+ const auto cuda_noise = cuda_stdev / cuda_mean;
183248 {
184249 auto &summ = m_state.add_summary (" nv/cold/time/gpu/stdev/relative" );
185250 summ.set_string (" name" , " Noise" );
186251 summ.set_string (" hint" , " percentage" );
187252 summ.set_string (" description" , " Relative standard deviation of isolated GPU times" );
188- summ.set_float64 (" value" , noise );
253+ summ.set_float64 (" value" , cuda_noise );
189254 }
190255
191256 if (const auto items = m_state.get_element_count (); items != 0 )
@@ -194,12 +259,12 @@ void measure_cold_base::generate_summaries()
194259 summ.set_string (" name" , " Elem/s" );
195260 summ.set_string (" hint" , " item_rate" );
196261 summ.set_string (" description" , " Number of input elements processed per second" );
197- summ.set_float64 (" value" , static_cast <double >(items) / avg_cuda_time );
262+ summ.set_float64 (" value" , static_cast <double >(items) / cuda_mean );
198263 }
199264
200265 if (const auto bytes = m_state.get_global_memory_rw_bytes (); bytes != 0 )
201266 {
202- const auto avg_used_gmem_bw = static_cast <double >(bytes) / avg_cuda_time ;
267+ const auto avg_used_gmem_bw = static_cast <double >(bytes) / cuda_mean ;
203268 {
204269 auto &summ = m_state.add_summary (" nv/cold/bw/global/bytes_per_second" );
205270 summ.set_string (" name" , " GlobalMem BW" );
@@ -240,16 +305,18 @@ void measure_cold_base::generate_summaries()
240305
241306 if (m_max_time_exceeded)
242307 {
243- const auto timeout = m_walltime_timer.get_duration ();
308+ const auto timeout = m_walltime_timer.get_duration ();
309+ const auto max_noise = m_criterion_params.get_float64 (" max-noise" );
310+ const auto min_time = m_criterion_params.get_float64 (" min-time" );
244311
245- if (noise > max_noise)
312+ if (cuda_noise > max_noise)
246313 {
247314 printer.log (nvbench::log_level::warn,
248315 fmt::format (" Current measurement timed out ({:0.2f}s) "
249316 " while over noise threshold ({:0.2f}% > "
250317 " {:0.2f}%)" ,
251318 timeout,
252- noise * 100 ,
319+ cuda_noise * 100 ,
253320 max_noise * 100 ));
254321 }
255322 if (m_total_samples < m_min_samples)
@@ -277,8 +344,8 @@ void measure_cold_base::generate_summaries()
277344 printer.log (nvbench::log_level::pass,
278345 fmt::format (" Cold: {:0.6f}ms GPU, {:0.6f}ms CPU, {:0.2f}s "
279346 " total GPU, {:0.2f}s total wall, {}x " ,
280- avg_cuda_time * 1e3 ,
281- avg_cpu_time * 1e3 ,
347+ cuda_mean * 1e3 ,
348+ cpu_mean * 1e3 ,
282349 m_total_cuda_time,
283350 m_walltime_timer.get_duration (),
284351 m_total_samples));
0 commit comments