NVIDIA
diff --git a/‎docs/cli_help.md‎
Lines changed: 12 additions & 0 deletions b/‎docs/cli_help.md‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎nvbench/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎nvbench/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎nvbench/benchmark_base.cuh‎
Lines changed: 26 additions & 3 deletions b/‎nvbench/benchmark_base.cuh‎
Lines changed: 26 additions & 3 deletions
diff --git a/‎nvbench/benchmark_base.cxx‎
Lines changed: 4 additions & 2 deletions b/‎nvbench/benchmark_base.cxx‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎nvbench/detail/gpu_frequency.cuh‎
Lines changed: 53 additions & 0 deletions b/‎nvbench/detail/gpu_frequency.cuh‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎nvbench/detail/gpu_frequency.cxx‎
Lines changed: 47 additions & 0 deletions b/‎nvbench/detail/gpu_frequency.cxx‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎nvbench/detail/measure_cold.cu‎
Lines changed: 52 additions & 0 deletions b/‎nvbench/detail/measure_cold.cu‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎nvbench/detail/measure_cold.cuh‎
Lines changed: 24 additions & 7 deletions b/‎nvbench/detail/measure_cold.cuh‎
Lines changed: 24 additions & 7 deletions
@@ -133,6 +133,18 @@
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
+* `--throttle-threshold <value>`
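+  * Set the GPU throttle threshold as a percentage of the peak SM clock rate.
+    Cold-measurement samples taken while the measured SM clock rate is below
+    this threshold are discarded.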
+  * Default is 75%.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--throttle-recovery-delay <value>`
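+  * Set the GPU throttle recovery delay in seconds. This is how long the
+    benchmark pauses after discarding a throttled sample, to let the GPU
+    recover. A value of 0 disables the pause.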
+  * Default is 0.05 seconds.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
 * `--run-once`
   * Only run the benchmark once, skipping any warmup runs and batched
     measurements.
 
@@ -29,6 +29,8 @@ set(srcs
   detail/measure_hot.cu
   detail/state_generator.cxx
   detail/stdrel_criterion.cxx
+  detail/gpu_frequency.cxx
+  detail/timestamps_kernel.cu
 
   internal/nvml.cxx
 )
 
@@ -247,12 +247,32 @@ struct benchmark_base
   }
   /// @}
 
-  [[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; }
-  [[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
+  /// Control the GPU throttle threshold. The value is a fraction of the peak SM clock rate
+  /// (the default of 0.75 corresponds to 75%); measurements taken while the measured clock rate
+  /// is below `peak * threshold` are treated as throttled.
+  /// @{
+  [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
+
+  /// Returns `*this` so that calls can be chained, like the other setters on this class.
+  benchmark_base &set_throttle_threshold(nvbench::float32_t throttle_threshold)
+  {
+    m_throttle_threshold = throttle_threshold;
+    return *this;
+  }
+  /// @}
+
+  /// Control the GPU throttle recovery delay, in seconds. This is how long the measurement loop
+  /// pauses after a throttled sample has been detected.
+  /// @{
+  [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
+  {
+    return m_throttle_recovery_delay;
+  }
+
+  /// Returns `*this` so that calls can be chained, like the other setters on this class.
+  benchmark_base &set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
+  {
+    m_throttle_recovery_delay = throttle_recovery_delay;
+    return *this;
+  }
+  /// @}
+
+  /// @return Mutable reference to this benchmark's criterion parameters.
+  [[nodiscard]] nvbench::criterion_params &get_criterion_params()
+  {
+    return m_criterion_params;
+  }
+
+  /// @return Read-only reference to this benchmark's criterion parameters.
+  [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
+  {
+    return m_criterion_params;
+  }
 
   /// Control the stopping criterion for the measurement loop.
   /// @{
-  [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
+  /// @return The name of the currently selected stopping criterion.
+  [[nodiscard]] const std::string &get_stopping_criterion() const
+  {
+    return m_stopping_criterion;
+  }
   benchmark_base &set_stopping_criterion(std::string criterion)
   {
     m_stopping_criterion = std::move(criterion);
@@ -282,6 +302,9 @@ protected:
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
+  // GPU throttle detection settings. The threshold is a fraction (0.75 == 75%), not a percentage
+  // value: it is multiplied directly with the peak SM clock rate when checking for throttling.
+  nvbench::float32_t m_throttle_threshold{0.75f};      // [fraction of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
+
   nvbench::criterion_params m_criterion_params;
   std::string m_stopping_criterion{"stdrel"};
 
 
@@ -17,7 +17,6 @@
  */
 
 #include <nvbench/benchmark_base.cuh>
-
 #include <nvbench/detail/transform_reduce.cuh>
 
 namespace nvbench
@@ -45,7 +44,10 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_skip_time = m_skip_time;
   result->m_timeout   = m_timeout;
 
-  result->m_criterion_params   = m_criterion_params;
+  result->m_criterion_params        = m_criterion_params;
+  result->m_throttle_threshold      = m_throttle_threshold;
+  result->m_throttle_recovery_delay = m_throttle_recovery_delay;
+
   result->m_stopping_criterion = m_stopping_criterion;
 
   return result;
 
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/detail/timestamps_kernel.cuh>
+#include <nvbench/types.cuh>
+
+namespace nvbench::detail
+{
+
+struct cuda_stream;
+
+/// Estimates the GPU clock frequency over a region of stream work.
+///
+/// Usage: call start() before and stop() after the work of interest, wait for the stream to
+/// finish, then query get_clock_frequency() or has_throttled().
+struct gpu_frequency
+{
+  gpu_frequency() = default;
+
+  // Copying is disabled; instances can only be moved.
+  gpu_frequency(const gpu_frequency &)            = delete;
+  gpu_frequency &operator=(const gpu_frequency &) = delete;
+  gpu_frequency(gpu_frequency &&)                 = default;
+  gpu_frequency &operator=(gpu_frequency &&)      = default;
+
+  /// Records the opening timestamps on `stream`.
+  void start(const nvbench::cuda_stream &stream)
+  {
+    m_start.record(stream);
+  }
+
+  /// Records the closing timestamps on `stream`.
+  void stop(const nvbench::cuda_stream &stream)
+  {
+    m_stop.record(stream);
+  }
+
+  /// @return true when the measured clock frequency is below
+  ///         `peak_sm_clock_rate_hz * throttle_threshold`.
+  [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+                                   nvbench::float32_t throttle_threshold);
+
+  /// @return The clock frequency derived from the start/stop timestamps
+  ///         (elapsed clocks per elapsed nanosecond, scaled to Hz).
+  [[nodiscard]] nvbench::float32_t get_clock_frequency();
+
+private:
+  // Timestamp pairs captured by start() and stop() respectively.
+  nvbench::detail::timestamps_kernel m_start;
+  nvbench::detail::timestamps_kernel m_stop;
+};
+
+} // namespace nvbench::detail
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/gpu_frequency.cuh>
+
+#include <iostream>
+
+namespace nvbench::detail
+{
+
+// Computes the clock frequency (Hz) between the start() and stop() records as
+// elapsed clocks / elapsed nanoseconds * 1e9.
+nvbench::float32_t gpu_frequency::get_clock_frequency()
+{
+  // Slot 0 is used as a nanosecond timestamp and slot 1 as a clock counter.
+  // NOTE(review): inferred from how the values are used here -- confirm against timestamps_kernel.
+  const nvbench::uint64_t ns_delta    = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0];
+  const nvbench::uint64_t cycle_delta = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1];
+
+  // Clocks per nanosecond, then scaled up to clocks per second.
+  const float cycles_per_ns = static_cast<float>(cycle_delta) / static_cast<float>(ns_delta);
+  return cycles_per_ns * 1000000000.f;
+}
+
+// Reports whether the measured clock frequency fell below the given fraction of the peak rate.
+//
+// peak_sm_clock_rate_hz: reference (peak) SM clock rate in Hz.
+// throttle_threshold:    multiplier applied to the peak rate to obtain the minimum acceptable rate.
+bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+                                  nvbench::float32_t throttle_threshold)
+{
+  const float min_acceptable_hz = peak_sm_clock_rate_hz * throttle_threshold;
+  return this->get_clock_frequency() < min_acceptable_hz;
+}
+
+} // namespace nvbench::detail
@@ -26,7 +26,9 @@
 #include <nvbench/summary.cuh>
 
 #include <algorithm>
+#include <chrono>
 #include <limits>
+#include <thread>
 
 #include <fmt/format.h>
 
@@ -44,11 +46,14 @@ measure_cold_base::measure_cold_base(state &exec_state)
     , m_min_samples{exec_state.get_min_samples()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
+    , m_throttle_threshold(exec_state.get_throttle_threshold())
+    , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
 {
   if (m_min_samples > 0)
   {
     m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
     m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
+    m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
   }
 }
 
@@ -78,6 +83,7 @@ void measure_cold_base::initialize()
 
   m_cuda_times.clear();
   m_cpu_times.clear();
+  m_sm_clock_rates.clear();
 
   m_stopping_criterion.initialize(m_criterion_params);
 }
@@ -86,6 +92,40 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
 
 void measure_cold_base::record_measurements()
 {
+  // Throttle check. Skipped for run-once measurements, for which the GPU frequency timestamps are
+  // not recorded.
+  if (!m_run_once)
+  {
+    const auto peak_clock_rate =
+      static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
+    // Queried once so that the warning below and the recorded sample use the same value.
+    const auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
+
+    if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
+    {
+      if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
+      {
+        auto &printer = printer_opt_ref.value().get();
+        printer.log(nvbench::log_level::warn,
+                    fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
+                                "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
+                                "and pausing for {}s.",
+                                current_clock_rate / 1000000.0f,
+                                peak_clock_rate / 1000000.0f,
+                                100.0f * (current_clock_rate / peak_clock_rate),
+                                100.0f * m_throttle_threshold,
+                                m_total_samples,
+                                m_throttle_recovery_delay));
+      }
+
+      if (m_throttle_recovery_delay > 0.0f)
+      { // let the GPU cool down
+        std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
+      }
+
+      // ignore this measurement
+      return;
+    }
+
+    // Record the *measured* rate rather than the peak rate; otherwise the mean SM clock rate
+    // summary would always equal the device's default clock rate.
+    m_sm_clock_rates.push_back(current_clock_rate);
+  }
+
+
   // Update and record timers and counters:
   const auto cur_cuda_time = m_cuda_timer.get_duration();
   const auto cur_cpu_time  = m_cpu_timer.get_duration();
@@ -298,6 +338,18 @@ void measure_cold_base::generate_summaries()
     summ.set_string("hide", "Hidden by default.");
   }
 
+  if (!m_sm_clock_rates.empty())
+  {
+    auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
+    summ.set_string("name", "Clock Rate");
+    summ.set_string("hint", "frequency");
+    summ.set_string("description", "Mean SM clock rate");
+    summ.set_string("hide", "Hidden by default.");
+    summ.set_float64("value",
+                     nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
+                                                               m_sm_clock_rates.cend()));
+  }
+
   // Log if a printer exists:
   if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
 
@@ -18,24 +18,26 @@
 
 #pragma once
 
+#include <cuda_runtime.h>
+
 #include <nvbench/blocking_kernel.cuh>
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
+#include <nvbench/detail/gpu_frequency.cuh>
+#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
+#include <nvbench/detail/l2flush.cuh>
+#include <nvbench/detail/statistics.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/stopping_criterion.cuh>
 
-#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
-#include <nvbench/detail/l2flush.cuh>
-#include <nvbench/detail/statistics.cuh>
-
-#include <cuda_runtime.h>
-
 #include <utility>
 #include <vector>
 
+#include "nvbench/types.cuh"
+
 namespace nvbench
 {
 
@@ -64,6 +66,8 @@ protected:
   bool is_finished();
   void run_trials_epilogue();
   void generate_summaries();
+  /// Forwards to m_gpu_frequency.start() using the launch's stream.
+  void gpu_frequency_start()
+  {
+    m_gpu_frequency.start(m_launch.get_stream());
+  }
+  /// Forwards to m_gpu_frequency.stop() using the launch's stream.
+  void gpu_frequency_stop()
+  {
+    m_gpu_frequency.stop(m_launch.get_stream());
+  }
 
   void check_skip_time(nvbench::float64_t warmup_time);
 
@@ -87,7 +91,8 @@ protected:
   nvbench::blocking_kernel m_blocker;
 
   nvbench::criterion_params m_criterion_params;
-  nvbench::stopping_criterion_base& m_stopping_criterion;
+  nvbench::stopping_criterion_base &m_stopping_criterion;
+  nvbench::detail::gpu_frequency m_gpu_frequency;
 
   bool m_disable_blocking_kernel{false};
   bool m_run_once{false};
@@ -97,6 +102,9 @@ protected:
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
+  // GPU throttle detection settings, copied from the execution state at construction. The
+  // threshold is a fraction (e.g. 0.75 == 75%) that is multiplied with the peak SM clock rate.
+  nvbench::float32_t m_throttle_threshold;      // [fraction of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay; // [seconds]
+
   nvbench::int64_t m_total_samples{};
 
   nvbench::float64_t m_min_cuda_time{};
@@ -109,6 +117,7 @@ protected:
 
   std::vector<nvbench::float64_t> m_cuda_times;
   std::vector<nvbench::float64_t> m_cpu_times;
+  std::vector<nvbench::float32_t> m_sm_clock_rates;
 
   bool m_max_time_exceeded{};
 };
@@ -128,6 +137,10 @@ struct measure_cold_base::kernel_launch_timer
     {
       m_measure.block_stream();
     }
+    if (!m_measure.m_run_once)
+    {
+      m_measure.gpu_frequency_start();
+    }
     m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
     if (m_disable_blocking_kernel)
     {
@@ -143,6 +156,10 @@ struct measure_cold_base::kernel_launch_timer
       m_measure.m_cpu_timer.start();
       m_measure.unblock_stream();
     }
+    if (!m_measure.m_run_once)
+    {
+      m_measure.gpu_frequency_stop();
+    }
     m_measure.sync_stream();
     m_measure.m_cpu_timer.stop();
   }
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,8 @@ set(srcs`
`29`	`29`	`detail/measure_hot.cu`
`30`	`30`	`detail/state_generator.cxx`
`31`	`31`	`detail/stdrel_criterion.cxx`
	`32`	`+ detail/gpu_frequency.cxx`
	`33`	`+ detail/timestamps_kernel.cu`
`32`	`34`
`33`	`35`	`internal/nvml.cxx`
`34`	`36`	`)`