Skip to content

Commit f29f7ac

Browse files
committed
Detect throttle
Signed-off-by: Georgy Evtushenko <[email protected]>
1 parent 36adf3a commit f29f7ac

15 files changed

+435
-33
lines changed

docs/cli_help.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,23 @@
133133
* Applies to the most recent `--benchmark`, or all benchmarks if specified
134134
before any `--benchmark` arguments.
135135

136+
* `--discard-on-throttle`
137+
* Discard measurements if the GPU is throttled.
138+
* Applies to the most recent `--benchmark`, or all benchmarks if specified
139+
before any `--benchmark` arguments.
140+
141+
* `--throttle-threshold <value>`
142+
* Set the GPU throttle threshold as a fraction of the peak clock rate (e.g. 0.75 means 75%).
143+
* Default is 0.75 (75%).
144+
* Applies to the most recent `--benchmark`, or all benchmarks if specified
145+
before any `--benchmark` arguments.
146+
147+
* `--throttle-recovery-delay <value>`
148+
* Set the GPU throttle recovery delay in seconds.
149+
* Default is 0 seconds (no recovery delay).
150+
* Applies to the most recent `--benchmark`, or all benchmarks if specified
151+
before any `--benchmark` arguments.
152+
136153
* `--run-once`
137154
* Only run the benchmark once, skipping any warmup runs and batched
138155
measurements.

nvbench/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ set(srcs
2929
detail/measure_hot.cu
3030
detail/state_generator.cxx
3131
detail/stdrel_criterion.cxx
32+
detail/gpu_frequency.cxx
33+
detail/timestamps_kernel.cu
3234

3335
internal/nvml.cxx
3436
)

nvbench/benchmark_base.cuh

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,12 +247,39 @@ struct benchmark_base
247247
}
248248
/// @}
249249

250-
[[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; }
251-
[[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
250+
/// Returns the throttle threshold, expressed relative to the peak SM clock
/// rate (the member defaults to 0.75f).
[[nodiscard]] nvbench::float32_t get_throttle_threshold() const
{
  return m_throttle_threshold;
}
251+
252+
void set_throttle_threshold(nvbench::float32_t throttle_threshold)
253+
{
254+
m_throttle_threshold = throttle_threshold;
255+
}
256+
257+
/// Returns the delay, in seconds, to wait after a throttled measurement.
[[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const { return m_throttle_recovery_delay; }
261+
262+
void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
263+
{
264+
m_throttle_recovery_delay = throttle_recovery_delay;
265+
}
266+
267+
/// Returns whether measurements taken while the GPU was throttled are
/// discarded.
[[nodiscard]] bool get_discard_on_throttle() const
{
  return m_discard_on_throttle;
}
268+
269+
void set_discard_on_throttle(bool discard_on_throttle)
270+
{
271+
m_discard_on_throttle = discard_on_throttle;
272+
}
273+
274+
/// Mutable access to the stopping-criterion parameters held by this benchmark.
[[nodiscard]] nvbench::criterion_params &get_criterion_params()
{
  return m_criterion_params;
}
275+
/// Read-only access to the stopping-criterion parameters held by this benchmark.
[[nodiscard]] const nvbench::criterion_params &get_criterion_params() const { return m_criterion_params; }
252279

253280
/// Control the stopping criterion for the measurement loop.
254281
/// @{
255-
[[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
282+
/// Returns the name of the currently selected stopping criterion.
[[nodiscard]] const std::string &get_stopping_criterion() const
{
  return m_stopping_criterion;
}
256283
benchmark_base &set_stopping_criterion(std::string criterion)
257284
{
258285
m_stopping_criterion = std::move(criterion);
@@ -282,6 +309,10 @@ protected:
282309
nvbench::float64_t m_skip_time{-1.};
283310
nvbench::float64_t m_timeout{15.};
284311

312+
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
313+
nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
314+
bool m_discard_on_throttle{false};
315+
285316
nvbench::criterion_params m_criterion_params;
286317
std::string m_stopping_criterion{"stdrel"};
287318

nvbench/benchmark_base.cxx

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
*/
1818

1919
#include <nvbench/benchmark_base.cuh>
20-
2120
#include <nvbench/detail/transform_reduce.cuh>
2221

2322
namespace nvbench
@@ -45,7 +44,11 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
4544
result->m_skip_time = m_skip_time;
4645
result->m_timeout = m_timeout;
4746

48-
result->m_criterion_params = m_criterion_params;
47+
result->m_criterion_params = m_criterion_params;
48+
result->m_throttle_threshold = m_throttle_threshold;
49+
result->m_throttle_recovery_delay = m_throttle_recovery_delay;
50+
result->m_discard_on_throttle = m_discard_on_throttle;
51+
4952
result->m_stopping_criterion = m_stopping_criterion;
5053

5154
return result;

nvbench/detail/gpu_frequency.cuh

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright 2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#pragma once
20+
21+
#include <nvbench/detail/timestamps_kernel.cuh>
22+
#include <nvbench/types.cuh>
23+
24+
namespace nvbench::detail
25+
{
26+
27+
struct cuda_stream;
28+
29+
struct gpu_frequency
30+
{
31+
gpu_frequency() = default;
32+
33+
// move-only
34+
gpu_frequency(const gpu_frequency &) = delete;
35+
gpu_frequency(gpu_frequency &&) = default;
36+
gpu_frequency &operator=(const gpu_frequency &) = delete;
37+
gpu_frequency &operator=(gpu_frequency &&) = default;
38+
39+
void start(const nvbench::cuda_stream &stream) { m_start.record(stream); }
40+
41+
void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
42+
43+
[[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
44+
nvbench::float32_t throttle_threshold);
45+
46+
[[nodiscard]] nvbench::float32_t get_clock_frequency();
47+
48+
private:
49+
nvbench::detail::timestamps_kernel m_start;
50+
nvbench::detail::timestamps_kernel m_stop;
51+
};
52+
53+
} // namespace nvbench::detail

nvbench/detail/gpu_frequency.cxx

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright 2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#include <nvbench/detail/gpu_frequency.cuh>
20+
21+
#include <iostream>
22+
23+
namespace nvbench::detail
24+
{
25+
26+
/// Computes the average clock rate between the start and stop samples.
///
/// Entry 0 of each sample is used as a nanosecond timestamp and entry 1 as a
/// clock counter (NOTE(review): inferred from how the values are used here;
/// confirm against timestamps_kernel). The result is clocks per second.
nvbench::float32_t gpu_frequency::get_clock_frequency()
{
  const nvbench::uint64_t ns_delta    = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0];
  const nvbench::uint64_t clock_delta = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1];

  // clocks per nanosecond, scaled to clocks per second
  const nvbench::float32_t clocks_per_ns = static_cast<float>(clock_delta) /
                                           static_cast<float>(ns_delta);
  return clocks_per_ns * 1000000000.f;
}
33+
34+
/// Reports whether the measured clock frequency fell below the allowed minimum.
///
/// @param peak_sm_clock_rate_hz Peak SM clock rate, in Hz.
/// @param throttle_threshold    Multiplier applied to the peak rate to obtain
///                              the minimum acceptable frequency.
/// @return true if the measured frequency is strictly below
///         `peak_sm_clock_rate_hz * throttle_threshold`.
bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
                                  nvbench::float32_t throttle_threshold)
{
  const float min_acceptable_hz = peak_sm_clock_rate_hz * throttle_threshold;
  return this->get_clock_frequency() < min_acceptable_hz;
}
46+
47+
} // namespace nvbench::detail

nvbench/detail/measure_cold.cu

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727

2828
#include <algorithm>
2929
#include <limits>
30+
#include <chrono>
31+
#include <thread>
3032

3133
#include <fmt/format.h>
3234

@@ -44,6 +46,9 @@ measure_cold_base::measure_cold_base(state &exec_state)
4446
, m_min_samples{exec_state.get_min_samples()}
4547
, m_skip_time{exec_state.get_skip_time()}
4648
, m_timeout{exec_state.get_timeout()}
49+
, m_throttle_threshold(exec_state.get_throttle_threshold())
50+
, m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
51+
, m_discard_on_throttle(exec_state.get_discard_on_throttle())
4752
{
4853
if (m_min_samples > 0)
4954
{
@@ -86,6 +91,41 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
8691

8792
void measure_cold_base::record_measurements()
8893
{
94+
if (!m_run_once)
95+
{
96+
auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
97+
98+
if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
99+
{
100+
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
101+
{
102+
auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
103+
auto &printer = printer_opt_ref.value().get();
104+
printer.log(nvbench::log_level::warn,
105+
fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
106+
"({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and "
107+
"pausing for {}s.",
108+
current_clock_rate / 1000000.0f,
109+
peak_clock_rate / 1000000.0f,
110+
100.0f * (current_clock_rate / peak_clock_rate),
111+
100.0f * m_throttle_threshold,
112+
m_total_samples,
113+
m_discard_on_throttle ? "Discarding" : "Keeping",
114+
m_throttle_recovery_delay));
115+
}
116+
117+
if (m_throttle_recovery_delay > 0.0f)
118+
{ // let the GPU cool down
119+
std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
120+
}
121+
122+
if (m_discard_on_throttle)
123+
{ // ignore this measurement
124+
return;
125+
}
126+
}
127+
}
128+
89129
// Update and record timers and counters:
90130
const auto cur_cuda_time = m_cuda_timer.get_duration();
91131
const auto cur_cpu_time = m_cpu_timer.get_duration();

nvbench/detail/measure_cold.cuh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
3131
#include <nvbench/detail/l2flush.cuh>
3232
#include <nvbench/detail/statistics.cuh>
33+
#include <nvbench/detail/gpu_frequency.cuh>
3334

3435
#include <cuda_runtime.h>
3536

@@ -64,6 +65,8 @@ protected:
6465
bool is_finished();
6566
void run_trials_epilogue();
6667
void generate_summaries();
68+
/// Records the starting GPU frequency sample on the launch stream.
void gpu_frequency_start()
{
  m_gpu_frequency.start(m_launch.get_stream());
}
69+
/// Records the ending GPU frequency sample on the launch stream.
void gpu_frequency_stop()
{
  m_gpu_frequency.stop(m_launch.get_stream());
}
6770

6871
void check_skip_time(nvbench::float64_t warmup_time);
6972

@@ -88,6 +91,7 @@ protected:
8891

8992
nvbench::criterion_params m_criterion_params;
9093
nvbench::stopping_criterion_base& m_stopping_criterion;
94+
nvbench::detail::gpu_frequency m_gpu_frequency;
9195

9296
bool m_disable_blocking_kernel{false};
9397
bool m_run_once{false};
@@ -97,6 +101,10 @@ protected:
97101
nvbench::float64_t m_skip_time{};
98102
nvbench::float64_t m_timeout{};
99103

104+
nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate]
105+
nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
106+
bool m_discard_on_throttle{false};
107+
100108
nvbench::int64_t m_total_samples{};
101109

102110
nvbench::float64_t m_min_cuda_time{};
@@ -128,6 +136,10 @@ struct measure_cold_base::kernel_launch_timer
128136
{
129137
m_measure.block_stream();
130138
}
139+
if (!m_measure.m_run_once)
140+
{
141+
m_measure.gpu_frequency_start();
142+
}
131143
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
132144
if (m_disable_blocking_kernel)
133145
{
@@ -143,6 +155,10 @@ struct measure_cold_base::kernel_launch_timer
143155
m_measure.m_cpu_timer.start();
144156
m_measure.unblock_stream();
145157
}
158+
if (!m_measure.m_run_once)
159+
{
160+
m_measure.gpu_frequency_stop();
161+
}
146162
m_measure.sync_stream();
147163
m_measure.m_cpu_timer.stop();
148164
}

0 commit comments

Comments
 (0)