Merge pull request #204 from alliepiper/summaries

alliepiper · web-flow · commit 36adf3a210ba · 2025-04-08T17:51:36.000-04:00
Add min/max timings, new "summaries" example.
diff --git a/README.md b/README.md
@@ -73,6 +73,7 @@ various NVBench features and usecases:
 - [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
 - [Skipping benchmark configurations](examples/skip.cu)
 - [Benchmarking on a specific stream](examples/stream.cu)
+- [Adding / hiding columns (summaries) in markdown output](examples/summaries.cu)
 - [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
 - [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -8,6 +8,7 @@ set(example_srcs
   exec_tag_timer.cu
   skip.cu
   stream.cu
+  summaries.cu
   throughput.cu
 )
 
diff --git a/examples/summaries.cu b/examples/summaries.cu
@@ -0,0 +1,70 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+// Grab some testing kernels from NVBench:
+#include <nvbench/test_kernels.cuh>
+
+// #define PRINT_DEFAULT_SUMMARY_TAGS
+
+void summary_example(nvbench::state &state)
+{
+  // Read the "ms" and "us" axis values and combine them into a duration in seconds:
+  const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
+  const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
+  const auto duration = ms * 1e-3 + us * 1e-6;
+
+  // Add a new column to the summary table holding the derived duration used by the benchmark.
+  // See the documentation in nvbench/summary.cuh for more details.
+  {
+    nvbench::summary &summary = state.add_summary("duration");
+    summary.set_string("name", "Duration (s)");
+    summary.set_string("description", "The duration of the kernel execution.");
+    summary.set_string("hint", "duration");
+    summary.set_float64("value", duration);
+  }
+
+  // Run the measurements:
+  state.exec([duration](nvbench::launch &launch) {
+    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
+  });
+
+#ifdef PRINT_DEFAULT_SUMMARY_TAGS
+  // The default summary tags can be listed by iterating over the summaries after state.exec,
+  // or by inspecting the json output (--json <filename>).
+  // NOTE(review): std::cout needs <iostream>; confirm nvbench.cuh pulls it in transitively.
+  for (const auto &summary : state.get_summaries())
+  {
+    std::cout << summary.get_tag() << std::endl;
+  }
+#endif
+
+  // Default summary columns can be shown/hidden in the markdown output tables by removing/adding
+  // the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the
+  // means.
+  state.get_summary("nv/cold/time/gpu/min").remove_value("hide");
+  state.get_summary("nv/cold/time/gpu/max").remove_value("hide");
+  state.get_summary("nv/cold/time/gpu/mean").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/min").remove_value("hide");
+  state.get_summary("nv/cold/time/cpu/max").remove_value("hide");
+  state.get_summary("nv/cold/time/cpu/mean").set_string("hide", "");
+}
+NVBENCH_BENCH(summary_example)
+  .add_int64_axis("ms", nvbench::range(10, 50, 20))
+  .add_int64_axis("us", nvbench::range(100, 500, 200));
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
@@ -25,6 +25,9 @@
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 
+#include <algorithm>
+#include <limits>
+
 #include <fmt/format.h>
 
 namespace nvbench::detail
@@ -64,13 +67,17 @@ void measure_cold_base::check()
 
 void measure_cold_base::initialize()
 {
-  m_total_cuda_time = 0.;
-  m_total_cpu_time  = 0.;
-  m_cpu_noise       = 0.;
-  m_total_samples   = 0;
+  m_min_cuda_time     = std::numeric_limits<nvbench::float64_t>::max();
+  m_max_cuda_time     = std::numeric_limits<nvbench::float64_t>::lowest();
+  m_total_cuda_time   = 0.;
+  m_min_cpu_time      = std::numeric_limits<nvbench::float64_t>::max();
+  m_max_cpu_time      = std::numeric_limits<nvbench::float64_t>::lowest();
+  m_total_cpu_time    = 0.;
+  m_total_samples     = 0;
+  m_max_time_exceeded = false;
+
   m_cuda_times.clear();
   m_cpu_times.clear();
-  m_max_time_exceeded = false;
 
   m_stopping_criterion.initialize(m_criterion_params);
 }
@@ -82,10 +89,17 @@ void measure_cold_base::record_measurements()
   // Update and record timers and counters:
   const auto cur_cuda_time = m_cuda_timer.get_duration();
   const auto cur_cpu_time  = m_cpu_timer.get_duration();
-  m_cuda_times.push_back(cur_cuda_time);
-  m_cpu_times.push_back(cur_cpu_time);
+
+  m_min_cuda_time = std::min(m_min_cuda_time, cur_cuda_time);
+  m_max_cuda_time = std::max(m_max_cuda_time, cur_cuda_time);
   m_total_cuda_time += cur_cuda_time;
+  m_cuda_times.push_back(cur_cuda_time);
+
+  m_min_cpu_time = std::min(m_min_cpu_time, cur_cpu_time);
+  m_max_cpu_time = std::max(m_max_cpu_time, cur_cpu_time);
   m_total_cpu_time += cur_cpu_time;
+  m_cpu_times.push_back(cur_cpu_time);
+
   ++m_total_samples;
 
   m_stopping_criterion.add_measurement(cur_cuda_time);
@@ -118,21 +132,10 @@ bool measure_cold_base::is_finished()
   return false;
 }
 
-void measure_cold_base::run_trials_epilogue()
-{
-  // Only need to compute this at the end, not per iteration.
-  const auto cpu_mean  = m_total_cpu_time / static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
-                                                                         m_cpu_times.cend(),
-                                                                         cpu_mean);
-  m_cpu_noise          = cpu_stdev / cpu_mean;
-
-  m_walltime_timer.stop();
-}
+void measure_cold_base::run_trials_epilogue() { m_walltime_timer.stop(); }
 
 void measure_cold_base::generate_summaries()
 {
-  const auto d_samples = static_cast<double>(m_total_samples);
   {
     auto &summ = m_state.add_summary("nv/cold/sample_size");
     summ.set_string("name", "Samples");
@@ -141,51 +144,113 @@ void measure_cold_base::generate_summaries()
     summ.set_int64("value", m_total_samples);
   }
 
-  const auto avg_cpu_time = m_total_cpu_time / d_samples;
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/min");
+    summ.set_string("name", "Min CPU Time");
+    summ.set_string("hint", "duration");
+    summ.set_string("description",
+                    "Fastest isolated kernel execution time "
+                    "(measured on host CPU)");
+    summ.set_float64("value", m_min_cpu_time);
+    summ.set_string("hide", "Hidden by default.");
+  }
+
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/max");
+    summ.set_string("name", "Max CPU Time");
+    summ.set_string("hint", "duration");
+    summ.set_string("description",
+                    "Slowest isolated kernel execution time "
+                    "(measured on host CPU)");
+    summ.set_float64("value", m_max_cpu_time);
+    summ.set_string("hide", "Hidden by default.");
+  }
+
+  const auto d_samples = static_cast<double>(m_total_samples);
+  const auto cpu_mean  = m_total_cpu_time / d_samples;
   {
     auto &summ = m_state.add_summary("nv/cold/time/cpu/mean");
     summ.set_string("name", "CPU Time");
     summ.set_string("hint", "duration");
     summ.set_string("description",
                     "Mean isolated kernel execution time "
                     "(measured on host CPU)");
-    summ.set_float64("value", avg_cpu_time);
+    summ.set_float64("value", cpu_mean);
+  }
+
+  const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
+                                                                         m_cpu_times.cend(),
+                                                                         cpu_mean);
+  { // Absolute stdev of the CPU samples: a time value, unlike the relative "Noise" ratio below.
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute");
+    summ.set_string("name", "CPU Time Stdev");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Standard deviation of isolated CPU times");
+    summ.set_float64("value", cpu_stdev);
+    summ.set_string("hide", "Hidden by default.");
+  }
 
+  const auto cpu_noise = cpu_stdev / cpu_mean;
   {
     auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
     summ.set_string("description", "Relative standard deviation of isolated CPU times");
-    summ.set_float64("value", m_cpu_noise);
+    summ.set_float64("value", cpu_noise);
+  }
+
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/min");
+    summ.set_string("name", "Min GPU Time");
+    summ.set_string("hint", "duration");
+    summ.set_string("description",
+                    "Fastest isolated kernel execution time "
+                    "(measured with CUDA events)");
+    summ.set_float64("value", m_min_cuda_time);
+    summ.set_string("hide", "Hidden by default.");
+  }
+
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/max");
+    summ.set_string("name", "Max GPU Time");
+    summ.set_string("hint", "duration");
+    summ.set_string("description",
+                    "Slowest isolated kernel execution time "
+                    "(measured with CUDA events)");
+    summ.set_float64("value", m_max_cuda_time);
+    summ.set_string("hide", "Hidden by default.");
   }
 
-  const auto avg_cuda_time = m_total_cuda_time / d_samples;
+  const auto cuda_mean = m_total_cuda_time / d_samples;
   {
     auto &summ = m_state.add_summary("nv/cold/time/gpu/mean");
     summ.set_string("name", "GPU Time");
     summ.set_string("hint", "duration");
     summ.set_string("description",
                     "Mean isolated kernel execution time "
                     "(measured with CUDA events)");
-    summ.set_float64("value", avg_cuda_time);
+    summ.set_float64("value", cuda_mean);
   }
 
-  const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
+  const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
                                                                           m_cuda_times.cend(),
-                                                                          mean_cuda_time);
-  const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
-  const auto noise = cuda_rel_stdev;
-  const auto max_noise = m_criterion_params.get_float64("max-noise");
-  const auto min_time = m_criterion_params.get_float64("min-time");
+                                                                          cuda_mean);
+  { // Absolute stdev of the GPU samples: a time value, unlike the relative "Noise" ratio below.
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute");
+    summ.set_string("name", "GPU Time Stdev");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Standard deviation of isolated GPU times");
+    summ.set_float64("value", cuda_stdev);
+    summ.set_string("hide", "Hidden by default.");
+  }
 
+  const auto cuda_noise = cuda_stdev / cuda_mean;
   {
     auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
     summ.set_string("description", "Relative standard deviation of isolated GPU times");
-    summ.set_float64("value", noise);
+    summ.set_float64("value", cuda_noise);
   }
 
   if (const auto items = m_state.get_element_count(); items != 0)
@@ -194,12 +259,12 @@ void measure_cold_base::generate_summaries()
     summ.set_string("name", "Elem/s");
     summ.set_string("hint", "item_rate");
     summ.set_string("description", "Number of input elements processed per second");
-    summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
+    summ.set_float64("value", static_cast<double>(items) / cuda_mean);
   }
 
   if (const auto bytes = m_state.get_global_memory_rw_bytes(); bytes != 0)
   {
-    const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
+    const auto avg_used_gmem_bw = static_cast<double>(bytes) / cuda_mean;
     {
       auto &summ = m_state.add_summary("nv/cold/bw/global/bytes_per_second");
       summ.set_string("name", "GlobalMem BW");
@@ -240,16 +305,18 @@ void measure_cold_base::generate_summaries()
 
     if (m_max_time_exceeded)
     {
-      const auto timeout = m_walltime_timer.get_duration();
+      const auto timeout   = m_walltime_timer.get_duration();
+      const auto max_noise = m_criterion_params.get_float64("max-noise");
+      const auto min_time  = m_criterion_params.get_float64("min-time");
 
-      if (noise > max_noise)
+      if (cuda_noise > max_noise)
       {
         printer.log(nvbench::log_level::warn,
                     fmt::format("Current measurement timed out ({:0.2f}s) "
                                 "while over noise threshold ({:0.2f}% > "
                                 "{:0.2f}%)",
                                 timeout,
-                                noise * 100,
+                                cuda_noise * 100,
                                 max_noise * 100));
       }
       if (m_total_samples < m_min_samples)
@@ -277,8 +344,8 @@ void measure_cold_base::generate_summaries()
     printer.log(nvbench::log_level::pass,
                 fmt::format("Cold: {:0.6f}ms GPU, {:0.6f}ms CPU, {:0.2f}s "
                             "total GPU, {:0.2f}s total wall, {}x ",
-                            avg_cuda_time * 1e3,
-                            avg_cpu_time * 1e3,
+                            cuda_mean * 1e3,
+                            cpu_mean * 1e3,
                             m_total_cuda_time,
                             m_walltime_timer.get_duration(),
                             m_total_samples));
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
@@ -98,9 +98,14 @@ protected:
   nvbench::float64_t m_timeout{};
 
   nvbench::int64_t m_total_samples{};
+
+  nvbench::float64_t m_min_cuda_time{};
+  nvbench::float64_t m_max_cuda_time{};
   nvbench::float64_t m_total_cuda_time{};
+
+  nvbench::float64_t m_min_cpu_time{};
+  nvbench::float64_t m_max_cpu_time{};
   nvbench::float64_t m_total_cpu_time{};
-  nvbench::float64_t m_cpu_noise{}; // rel stdev
 
   std::vector<nvbench::float64_t> m_cuda_times;
   std::vector<nvbench::float64_t> m_cpu_times;
diff --git a/nvbench/detail/measure_cpu_only.cuh b/nvbench/detail/measure_cpu_only.cuh
@@ -76,8 +76,10 @@ protected:
   nvbench::float64_t m_timeout{};
 
   nvbench::int64_t m_total_samples{};
+
+  nvbench::float64_t m_min_cpu_time{};
+  nvbench::float64_t m_max_cpu_time{};
   nvbench::float64_t m_total_cpu_time{};
-  nvbench::float64_t m_cpu_noise{}; // rel stdev
 
   std::vector<nvbench::float64_t> m_cpu_times;
 
diff --git a/nvbench/detail/measure_cpu_only.cxx b/nvbench/detail/measure_cpu_only.cxx

Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ set(example_srcs`
`8`	`8`	`exec_tag_timer.cu`
`9`	`9`	`skip.cu`
`10`	`10`	`stream.cu`
	`11`	`+ summaries.cu`
`11`	`12`	`throughput.cu`
`12`	`13`	`)`
`13`	`14`