Skip to content

Commit 87dd032

Browse files
authored
Merge pull request #206 from gevtushenko/throttle
Discard measurements while GPU is throttling
2 parents 36adf3a + 254ac25 commit 87dd032

17 files changed

+441
-43
lines changed

docs/cli_help.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,18 @@
133133
* Applies to the most recent `--benchmark`, or all benchmarks if specified
134134
before any `--benchmark` arguments.
135135

136+
* `--throttle-threshold <value>`
137+
* Set the GPU throttle threshold as a percentage of the peak SM clock rate. Measurements taken while the measured clock rate is below this threshold are discarded.
138+
* Default is 75%.
139+
* Applies to the most recent `--benchmark`, or all benchmarks if specified
140+
before any `--benchmark` arguments.
141+
142+
* `--throttle-recovery-delay <value>`
143+
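* Set the GPU throttle recovery delay in seconds. This is how long the benchmark pauses after discarding a throttled measurement, giving the GPU time to cool down.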
144+
* Default is 0.05 seconds.
145+
* Applies to the most recent `--benchmark`, or all benchmarks if specified
146+
before any `--benchmark` arguments.
147+
136148
* `--run-once`
137149
* Only run the benchmark once, skipping any warmup runs and batched
138150
measurements.

nvbench/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ set(srcs
2929
detail/measure_hot.cu
3030
detail/state_generator.cxx
3131
detail/stdrel_criterion.cxx
32+
detail/gpu_frequency.cxx
33+
detail/timestamps_kernel.cu
3234

3335
internal/nvml.cxx
3436
)

nvbench/benchmark_base.cuh

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,12 +247,32 @@ struct benchmark_base
247247
}
248248
/// @}
249249

250-
[[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; }
251-
[[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
250+
[[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
251+
252+
void set_throttle_threshold(nvbench::float32_t throttle_threshold)
253+
{
254+
m_throttle_threshold = throttle_threshold;
255+
}
256+
257+
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
258+
{
259+
return m_throttle_recovery_delay;
260+
}
261+
262+
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
263+
{
264+
m_throttle_recovery_delay = throttle_recovery_delay;
265+
}
266+
267+
[[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; }
268+
[[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
269+
{
270+
return m_criterion_params;
271+
}
252272

253273
/// Control the stopping criterion for the measurement loop.
254274
/// @{
255-
[[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
275+
[[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
256276
benchmark_base &set_stopping_criterion(std::string criterion)
257277
{
258278
m_stopping_criterion = std::move(criterion);
@@ -282,6 +302,9 @@ protected:
282302
nvbench::float64_t m_skip_time{-1.};
283303
nvbench::float64_t m_timeout{15.};
284304

305+
nvbench::float32_t m_throttle_threshold{0.75f}; // [fraction of peak SM clock rate; 0.75 == 75%]
306+
nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
307+
285308
nvbench::criterion_params m_criterion_params;
286309
std::string m_stopping_criterion{"stdrel"};
287310

nvbench/benchmark_base.cxx

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
*/
1818

1919
#include <nvbench/benchmark_base.cuh>
20-
2120
#include <nvbench/detail/transform_reduce.cuh>
2221

2322
namespace nvbench
@@ -45,7 +44,10 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
4544
result->m_skip_time = m_skip_time;
4645
result->m_timeout = m_timeout;
4746

48-
result->m_criterion_params = m_criterion_params;
47+
result->m_criterion_params = m_criterion_params;
48+
result->m_throttle_threshold = m_throttle_threshold;
49+
result->m_throttle_recovery_delay = m_throttle_recovery_delay;
50+
4951
result->m_stopping_criterion = m_stopping_criterion;
5052

5153
return result;

nvbench/detail/gpu_frequency.cuh

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright 2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#pragma once
20+
21+
#include <nvbench/detail/timestamps_kernel.cuh>
22+
#include <nvbench/types.cuh>
23+
24+
namespace nvbench::detail
25+
{
26+
27+
struct cuda_stream;
28+
29+
struct gpu_frequency
30+
{
31+
gpu_frequency() = default;
32+
33+
// move-only
34+
gpu_frequency(const gpu_frequency &) = delete;
35+
gpu_frequency(gpu_frequency &&) = default;
36+
gpu_frequency &operator=(const gpu_frequency &) = delete;
37+
gpu_frequency &operator=(gpu_frequency &&) = default;
38+
39+
void start(const nvbench::cuda_stream &stream) { m_start.record(stream); }
40+
41+
void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
42+
43+
[[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
44+
nvbench::float32_t throttle_threshold);
45+
46+
[[nodiscard]] nvbench::float32_t get_clock_frequency();
47+
48+
private:
49+
nvbench::detail::timestamps_kernel m_start;
50+
nvbench::detail::timestamps_kernel m_stop;
51+
};
52+
53+
} // namespace nvbench::detail

nvbench/detail/gpu_frequency.cxx

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright 2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#include <nvbench/detail/gpu_frequency.cuh>
20+
21+
#include <iostream>
22+
23+
namespace nvbench::detail
24+
{
25+
26+
nvbench::float32_t gpu_frequency::get_clock_frequency()
27+
{
28+
nvbench::uint64_t elapsed_ns = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0];
29+
nvbench::uint64_t elapsed_clocks = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1];
30+
nvbench::float32_t clock_rate = float(elapsed_clocks) / float(elapsed_ns) * 1000000000.f;
31+
return clock_rate;
32+
}
33+
34+
bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
35+
nvbench::float32_t throttle_threshold)
36+
{
37+
float threshold = peak_sm_clock_rate_hz * throttle_threshold;
38+
39+
if (this->get_clock_frequency() < threshold)
40+
{
41+
return true;
42+
}
43+
44+
return false;
45+
}
46+
47+
} // namespace nvbench::detail

nvbench/detail/measure_cold.cu

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@
2626
#include <nvbench/summary.cuh>
2727

2828
#include <algorithm>
29+
#include <chrono>
2930
#include <limits>
31+
#include <thread>
3032

3133
#include <fmt/format.h>
3234

@@ -44,11 +46,14 @@ measure_cold_base::measure_cold_base(state &exec_state)
4446
, m_min_samples{exec_state.get_min_samples()}
4547
, m_skip_time{exec_state.get_skip_time()}
4648
, m_timeout{exec_state.get_timeout()}
49+
, m_throttle_threshold(exec_state.get_throttle_threshold())
50+
, m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
4751
{
4852
if (m_min_samples > 0)
4953
{
5054
m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
5155
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
56+
m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
5257
}
5358
}
5459

@@ -78,6 +83,7 @@ void measure_cold_base::initialize()
7883

7984
m_cuda_times.clear();
8085
m_cpu_times.clear();
86+
m_sm_clock_rates.clear();
8187

8288
m_stopping_criterion.initialize(m_criterion_params);
8389
}
@@ -86,6 +92,40 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
8692

8793
void measure_cold_base::record_measurements()
8894
{
95+
if (!m_run_once)
96+
{
97+
auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
98+
99+
if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
100+
{
101+
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
102+
{
103+
auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
104+
auto &printer = printer_opt_ref.value().get();
105+
printer.log(nvbench::log_level::warn,
106+
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
107+
"({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
108+
"and pausing for {}s.",
109+
current_clock_rate / 1000000.0f,
110+
peak_clock_rate / 1000000.0f,
111+
100.0f * (current_clock_rate / peak_clock_rate),
112+
100.0f * m_throttle_threshold,
113+
m_total_samples,
114+
m_throttle_recovery_delay));
115+
}
116+
117+
if (m_throttle_recovery_delay > 0.0f)
118+
{ // let the GPU cool down
119+
std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
120+
}
121+
122+
// ignore this measurement
123+
return;
124+
}
125+
126+
m_sm_clock_rates.push_back(peak_clock_rate);
127+
}
128+
89129
// Update and record timers and counters:
90130
const auto cur_cuda_time = m_cuda_timer.get_duration();
91131
const auto cur_cpu_time = m_cpu_timer.get_duration();
@@ -298,6 +338,18 @@ void measure_cold_base::generate_summaries()
298338
summ.set_string("hide", "Hidden by default.");
299339
}
300340

341+
if (!m_sm_clock_rates.empty())
342+
{
343+
auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
344+
summ.set_string("name", "Clock Rate");
345+
summ.set_string("hint", "frequency");
346+
summ.set_string("description", "Mean SM clock rate");
347+
summ.set_string("hide", "Hidden by default.");
348+
summ.set_float64("value",
349+
nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
350+
m_sm_clock_rates.cend()));
351+
}
352+
301353
// Log if a printer exists:
302354
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
303355
{

nvbench/detail/measure_cold.cuh

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,26 @@
1818

1919
#pragma once
2020

21+
#include <cuda_runtime.h>
22+
2123
#include <nvbench/blocking_kernel.cuh>
2224
#include <nvbench/cpu_timer.cuh>
2325
#include <nvbench/cuda_call.cuh>
2426
#include <nvbench/cuda_timer.cuh>
27+
#include <nvbench/detail/gpu_frequency.cuh>
28+
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
29+
#include <nvbench/detail/l2flush.cuh>
30+
#include <nvbench/detail/statistics.cuh>
2531
#include <nvbench/device_info.cuh>
2632
#include <nvbench/exec_tag.cuh>
2733
#include <nvbench/launch.cuh>
2834
#include <nvbench/stopping_criterion.cuh>
2935

30-
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
31-
#include <nvbench/detail/l2flush.cuh>
32-
#include <nvbench/detail/statistics.cuh>
33-
34-
#include <cuda_runtime.h>
35-
3636
#include <utility>
3737
#include <vector>
3838

39+
#include "nvbench/types.cuh"
40+
3941
namespace nvbench
4042
{
4143

@@ -64,6 +66,8 @@ protected:
6466
bool is_finished();
6567
void run_trials_epilogue();
6668
void generate_summaries();
69+
void gpu_frequency_start() { m_gpu_frequency.start(m_launch.get_stream()); }
70+
void gpu_frequency_stop() { m_gpu_frequency.stop(m_launch.get_stream()); }
6771

6872
void check_skip_time(nvbench::float64_t warmup_time);
6973

@@ -87,7 +91,8 @@ protected:
8791
nvbench::blocking_kernel m_blocker;
8892

8993
nvbench::criterion_params m_criterion_params;
90-
nvbench::stopping_criterion_base& m_stopping_criterion;
94+
nvbench::stopping_criterion_base &m_stopping_criterion;
95+
nvbench::detail::gpu_frequency m_gpu_frequency;
9196

9297
bool m_disable_blocking_kernel{false};
9398
bool m_run_once{false};
@@ -97,6 +102,9 @@ protected:
97102
nvbench::float64_t m_skip_time{};
98103
nvbench::float64_t m_timeout{};
99104

105+
nvbench::float32_t m_throttle_threshold; // [fraction of peak SM clock rate]
106+
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
107+
100108
nvbench::int64_t m_total_samples{};
101109

102110
nvbench::float64_t m_min_cuda_time{};
@@ -109,6 +117,7 @@ protected:
109117

110118
std::vector<nvbench::float64_t> m_cuda_times;
111119
std::vector<nvbench::float64_t> m_cpu_times;
120+
std::vector<nvbench::float32_t> m_sm_clock_rates;
112121

113122
bool m_max_time_exceeded{};
114123
};
@@ -128,6 +137,10 @@ struct measure_cold_base::kernel_launch_timer
128137
{
129138
m_measure.block_stream();
130139
}
140+
if (!m_measure.m_run_once)
141+
{
142+
m_measure.gpu_frequency_start();
143+
}
131144
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
132145
if (m_disable_blocking_kernel)
133146
{
@@ -143,6 +156,10 @@ struct measure_cold_base::kernel_launch_timer
143156
m_measure.m_cpu_timer.start();
144157
m_measure.unblock_stream();
145158
}
159+
if (!m_measure.m_run_once)
160+
{
161+
m_measure.gpu_frequency_stop();
162+
}
146163
m_measure.sync_stream();
147164
m_measure.m_cpu_timer.stop();
148165
}

0 commit comments

Comments
 (0)