Skip to content

Commit 70723ec

Browse files
committed
Dynamically increase recovery delay for consecutive discards.
1 parent 89bec09 commit 70723ec

File tree

2 files changed

+25
-5
lines changed

2 files changed

+25
-5
lines changed

nvbench/detail/measure_cold.cu

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ void measure_cold_base::initialize()
8181
m_total_samples = 0;
8282
m_max_time_exceeded = false;
8383

84+
m_dynamic_throttle_recovery_delay = m_throttle_recovery_delay;
85+
m_throttle_discard_count = 0;
86+
8487
m_cuda_times.clear();
8588
m_cpu_times.clear();
8689

@@ -99,29 +102,41 @@ void measure_cold_base::record_measurements()
99102

100103
if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold))
101104
{
105+
if (m_throttle_discard_count > 2)
106+
{
107+
// Throttling detected in multiple consecutive trials. The delay is not sufficient to
108+
// recover. Increase the delay by no more than half of a second:
109+
m_dynamic_throttle_recovery_delay += std::min(m_dynamic_throttle_recovery_delay * 1.5f,
110+
0.5f);
111+
}
112+
102113
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
103114
{
104115
auto &printer = printer_opt_ref.value().get();
105116
printer.log(nvbench::log_level::warn,
106117
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
107-
"({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
108-
"and pausing for {}s.",
118+
"({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous trial "
119+
"and pausing for {:0.3f}s.",
109120
current_clock_rate / 1000000.0f,
110121
default_clock_rate / 1000000.0f,
111122
100.0f * (current_clock_rate / default_clock_rate),
112123
100.0f * m_throttle_threshold,
113124
m_total_samples,
114-
m_throttle_recovery_delay));
125+
m_dynamic_throttle_recovery_delay));
115126
}
116127

117-
if (m_throttle_recovery_delay > 0.0f)
128+
if (m_dynamic_throttle_recovery_delay > 0.0f)
118129
{ // let the GPU cool down
119-
std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
130+
std::this_thread::sleep_for(
131+
std::chrono::duration<float>(m_dynamic_throttle_recovery_delay));
120132
}
121133

134+
m_throttle_discard_count += 1;
135+
122136
// ignore this measurement
123137
return;
124138
}
139+
m_throttle_discard_count = 0;
125140

126141
m_sm_clock_rate_accumulator += current_clock_rate;
127142
}

nvbench/detail/measure_cold.cuh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ protected:
104104
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
105105
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
106106

107+
// Dynamically increased when repeated throttling occurs
108+
// without successfully recording a sample.
109+
nvbench::float32_t m_dynamic_throttle_recovery_delay{}; // [seconds]
110+
nvbench::int64_t m_throttle_discard_count{};
111+
107112
nvbench::int64_t m_total_samples{};
108113

109114
nvbench::float64_t m_min_cuda_time{};

0 commit comments

Comments
 (0)