Skip to content

Commit 89bec09

Browse files
authored
Merge pull request #207 from alliepiper/throttle_followup
Throttling followup
2 parents 46ab283 + 9bf5e98 commit 89bec09

File tree

8 files changed

+57
-39
lines changed

8 files changed

+57
-39
lines changed

docs/cli_help.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@
134134
before any `--benchmark` arguments.
135135

136136
* `--throttle-threshold <value>`
137-
* Set the GPU throttle threshold as percentage of the peak clock rate.
137+
* Set the GPU throttle threshold as a percentage of the device's default clock rate.
138138
* Default is 75%.
139139
* Applies to the most recent `--benchmark`, or all benchmarks if specified
140140
before any `--benchmark` arguments.

examples/summaries.cu

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ void summary_example(nvbench::state &state)
4141
}
4242

4343
// Run the measurements:
44-
state.exec([duration](nvbench::launch &launch) {
44+
state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) {
4545
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
4646
});
4747

@@ -56,14 +56,17 @@ void summary_example(nvbench::state &state)
5656
#endif
5757

5858
// Default summary columns can be shown/hidden in the markdown output tables by adding/removing
59-
// the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the
60-
// means.
59+
// the "hide" key. Modify this benchmark to show the minimum and maximum GPU times, but hide the
60+
// mean GPU time and all CPU times. SM Clock frequency and throttling info are also shown.
6161
state.get_summary("nv/cold/time/gpu/min").remove_value("hide");
6262
state.get_summary("nv/cold/time/gpu/max").remove_value("hide");
6363
state.get_summary("nv/cold/time/gpu/mean").set_string("hide", "");
64-
state.get_summary("nv/cold/time/cpu/min").remove_value("hide");
65-
state.get_summary("nv/cold/time/cpu/max").remove_value("hide");
6664
state.get_summary("nv/cold/time/cpu/mean").set_string("hide", "");
65+
state.get_summary("nv/cold/time/cpu/min").set_string("hide", "");
66+
state.get_summary("nv/cold/time/cpu/max").set_string("hide", "");
67+
state.get_summary("nv/cold/time/cpu/stdev/relative").set_string("hide", "");
68+
state.get_summary("nv/cold/sm_clock_rate/mean").remove_value("hide");
69+
state.get_summary("nv/cold/sm_clock_rate/scaling/percent").remove_value("hide");
6770
}
6871
NVBENCH_BENCH(summary_example)
6972
.add_int64_axis("ms", nvbench::range(10, 50, 20))

nvbench/benchmark_base.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ protected:
302302
nvbench::float64_t m_skip_time{-1.};
303303
nvbench::float64_t m_timeout{15.};
304304

305-
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
305+
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of default SM clock rate]
306306
nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
307307

308308
nvbench::criterion_params m_criterion_params;

nvbench/detail/gpu_frequency.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ struct gpu_frequency
4040

4141
void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
4242

43-
[[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
43+
[[nodiscard]] bool has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
4444
nvbench::float32_t throttle_threshold);
4545

4646
[[nodiscard]] nvbench::float32_t get_clock_frequency();

nvbench/detail/gpu_frequency.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ nvbench::float32_t gpu_frequency::get_clock_frequency()
3131
return clock_rate;
3232
}
3333

34-
bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
34+
bool gpu_frequency::has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
3535
nvbench::float32_t throttle_threshold)
3636
{
37-
float threshold = peak_sm_clock_rate_hz * throttle_threshold;
37+
float threshold = default_sm_clock_rate_hz * throttle_threshold;
3838

3939
if (this->get_clock_frequency() < threshold)
4040
{

nvbench/detail/measure_cold.cu

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ measure_cold_base::measure_cold_base(state &exec_state)
5353
{
5454
m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
5555
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
56-
m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
5756
}
5857
}
5958

@@ -72,18 +71,18 @@ void measure_cold_base::check()
7271

7372
void measure_cold_base::initialize()
7473
{
75-
m_min_cuda_time = std::numeric_limits<nvbench::float64_t>::max();
76-
m_max_cuda_time = std::numeric_limits<nvbench::float64_t>::lowest();
77-
m_total_cuda_time = 0.;
78-
m_min_cpu_time = std::numeric_limits<nvbench::float64_t>::max();
79-
m_max_cpu_time = std::numeric_limits<nvbench::float64_t>::lowest();
80-
m_total_cpu_time = 0.;
81-
m_total_samples = 0;
82-
m_max_time_exceeded = false;
74+
m_min_cuda_time = std::numeric_limits<nvbench::float64_t>::max();
75+
m_max_cuda_time = std::numeric_limits<nvbench::float64_t>::lowest();
76+
m_total_cuda_time = 0.;
77+
m_min_cpu_time = std::numeric_limits<nvbench::float64_t>::max();
78+
m_max_cpu_time = std::numeric_limits<nvbench::float64_t>::lowest();
79+
m_total_cpu_time = 0.;
80+
m_sm_clock_rate_accumulator = 0.;
81+
m_total_samples = 0;
82+
m_max_time_exceeded = false;
8383

8484
m_cuda_times.clear();
8585
m_cpu_times.clear();
86-
m_sm_clock_rates.clear();
8786

8887
m_stopping_criterion.initialize(m_criterion_params);
8988
}
@@ -94,21 +93,22 @@ void measure_cold_base::record_measurements()
9493
{
9594
if (!m_run_once)
9695
{
97-
auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
96+
const auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
97+
const auto default_clock_rate =
98+
static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
9899

99-
if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
100+
if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold))
100101
{
101102
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
102103
{
103-
auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
104-
auto &printer = printer_opt_ref.value().get();
104+
auto &printer = printer_opt_ref.value().get();
105105
printer.log(nvbench::log_level::warn,
106106
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
107107
"({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
108108
"and pausing for {}s.",
109109
current_clock_rate / 1000000.0f,
110-
peak_clock_rate / 1000000.0f,
111-
100.0f * (current_clock_rate / peak_clock_rate),
110+
default_clock_rate / 1000000.0f,
111+
100.0f * (current_clock_rate / default_clock_rate),
112112
100.0f * m_throttle_threshold,
113113
m_total_samples,
114114
m_throttle_recovery_delay));
@@ -123,7 +123,7 @@ void measure_cold_base::record_measurements()
123123
return;
124124
}
125125

126-
m_sm_clock_rates.push_back(peak_clock_rate);
126+
m_sm_clock_rate_accumulator += current_clock_rate;
127127
}
128128

129129
// Update and record timers and counters:
@@ -338,16 +338,30 @@ void measure_cold_base::generate_summaries()
338338
summ.set_string("hide", "Hidden by default.");
339339
}
340340

341-
if (!m_sm_clock_rates.empty())
341+
if (m_sm_clock_rate_accumulator != 0.)
342342
{
343-
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
344-
summ.set_string("name", "Clock Rate");
345-
summ.set_string("hint", "frequency");
346-
summ.set_string("description", "Mean SM clock rate");
347-
summ.set_string("hide", "Hidden by default.");
348-
summ.set_float64("value",
349-
nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
350-
m_sm_clock_rates.cend()));
343+
const auto clock_mean = m_sm_clock_rate_accumulator / d_samples;
344+
345+
{
346+
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
347+
summ.set_string("name", "Clock Rate");
348+
summ.set_string("hint", "frequency");
349+
summ.set_string("description", "Mean SM clock rate");
350+
summ.set_string("hide", "Hidden by default.");
351+
summ.set_float64("value", clock_mean);
352+
}
353+
354+
{
355+
const auto default_clock_rate =
356+
static_cast<nvbench::float64_t>(m_state.get_device()->get_sm_default_clock_rate());
357+
358+
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/scaling/percent");
359+
summ.set_string("name", "Clock Scaling");
360+
summ.set_string("hint", "percentage");
361+
summ.set_string("description", "Mean SM clock rate as a percentage of default clock rate.");
362+
summ.set_string("hide", "Hidden by default.");
363+
summ.set_float64("value", clock_mean / default_clock_rate);
364+
}
351365
}
352366

353367
// Log if a printer exists:

nvbench/detail/measure_cold.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ protected:
101101
nvbench::float64_t m_skip_time{};
102102
nvbench::float64_t m_timeout{};
103103

104-
nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate]
104+
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
105105
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
106106

107107
nvbench::int64_t m_total_samples{};
@@ -114,9 +114,10 @@ protected:
114114
nvbench::float64_t m_max_cpu_time{};
115115
nvbench::float64_t m_total_cpu_time{};
116116

117+
nvbench::float64_t m_sm_clock_rate_accumulator{};
118+
117119
std::vector<nvbench::float64_t> m_cuda_times;
118120
std::vector<nvbench::float64_t> m_cpu_times;
119-
std::vector<nvbench::float32_t> m_sm_clock_rates;
120121

121122
bool m_max_time_exceeded{};
122123
};

nvbench/state.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ private:
331331
nvbench::float64_t m_skip_time;
332332
nvbench::float64_t m_timeout;
333333

334-
nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate]
334+
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
335335
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
336336

337337
// Deadlock protection. See blocking_kernel's class doc for details.

0 commit comments

Comments
 (0)