2626#include < nvbench/summary.cuh>
2727
2828#include < algorithm>
29+ #include < chrono>
2930#include < limits>
31+ #include < thread>
3032
3133#include < fmt/format.h>
3234
@@ -44,11 +46,14 @@ measure_cold_base::measure_cold_base(state &exec_state)
4446 , m_min_samples{exec_state.get_min_samples ()}
4547 , m_skip_time{exec_state.get_skip_time ()}
4648 , m_timeout{exec_state.get_timeout ()}
49+ , m_throttle_threshold(exec_state.get_throttle_threshold())
50+ , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
4751{
4852 if (m_min_samples > 0 )
4953 {
5054 m_cuda_times.reserve (static_cast <std::size_t >(m_min_samples));
5155 m_cpu_times.reserve (static_cast <std::size_t >(m_min_samples));
56+ m_sm_clock_rates.reserve (static_cast <std::size_t >(m_min_samples));
5257 }
5358}
5459
@@ -78,6 +83,7 @@ void measure_cold_base::initialize()
7883
7984 m_cuda_times.clear ();
8085 m_cpu_times.clear ();
86+ m_sm_clock_rates.clear ();
8187
8288 m_stopping_criterion.initialize (m_criterion_params);
8389}
@@ -86,6 +92,40 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
8692
8793void measure_cold_base::record_measurements ()
8894{
95+ if (!m_run_once)
96+ {
97+ auto peak_clock_rate = static_cast <float >(m_state.get_device ()->get_sm_default_clock_rate ());
98+
99+ if (m_gpu_frequency.has_throttled (peak_clock_rate, m_throttle_threshold))
100+ {
101+ if (auto printer_opt_ref = m_state.get_benchmark ().get_printer (); printer_opt_ref.has_value ())
102+ {
103+ auto current_clock_rate = m_gpu_frequency.get_clock_frequency ();
104+ auto &printer = printer_opt_ref.value ().get ();
105+ printer.log (nvbench::log_level::warn,
106+ fmt::format (" GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
107+ " ({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
108+ " and pausing for {}s." ,
109+ current_clock_rate / 1000000 .0f ,
110+ peak_clock_rate / 1000000 .0f ,
111+ 100 .0f * (current_clock_rate / peak_clock_rate),
112+ 100 .0f * m_throttle_threshold,
113+ m_total_samples,
114+ m_throttle_recovery_delay));
115+ }
116+
117+ if (m_throttle_recovery_delay > 0 .0f )
118+ { // let the GPU cool down
119+ std::this_thread::sleep_for (std::chrono::duration<float >(m_throttle_recovery_delay));
120+ }
121+
122+ // ignore this measurement
123+ return ;
124+ }
125+
126+ m_sm_clock_rates.push_back (peak_clock_rate);
127+ }
128+
89129 // Update and record timers and counters:
90130 const auto cur_cuda_time = m_cuda_timer.get_duration ();
91131 const auto cur_cpu_time = m_cpu_timer.get_duration ();
@@ -298,6 +338,18 @@ void measure_cold_base::generate_summaries()
298338 summ.set_string (" hide" , " Hidden by default." );
299339 }
300340
341+ if (!m_sm_clock_rates.empty ())
342+ {
343+ auto &summ = m_state.add_summary (" nv/cold/sm_clock_rate/mean" );
344+ summ.set_string (" name" , " Clock Rate" );
345+ summ.set_string (" hint" , " frequency" );
346+ summ.set_string (" description" , " Mean SM clock rate" );
347+ summ.set_string (" hide" , " Hidden by default." );
348+ summ.set_float64 (" value" ,
349+ nvbench::detail::statistics::compute_mean (m_sm_clock_rates.cbegin (),
350+ m_sm_clock_rates.cend ()));
351+ }
352+
301353 // Log if a printer exists:
302354 if (auto printer_opt_ref = m_state.get_benchmark ().get_printer (); printer_opt_ref.has_value ())
303355 {
0 commit comments