NVIDIA
diff --git a/‎README.md‎
Lines changed: 7 additions & 2 deletions b/‎README.md‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎docs/benchmarks.md‎
Lines changed: 93 additions & 12 deletions b/‎docs/benchmarks.md‎
Lines changed: 93 additions & 12 deletions
diff --git a/‎examples/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎examples/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/cpu_only.cu‎
Lines changed: 83 additions & 0 deletions b/‎examples/cpu_only.cu‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎nvbench/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎nvbench/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nvbench/benchmark_base.cuh‎
Lines changed: 11 additions & 0 deletions b/‎nvbench/benchmark_base.cuh‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎nvbench/benchmark_base.cxx‎
Lines changed: 8 additions & 2 deletions b/‎nvbench/benchmark_base.cxx‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎nvbench/benchmark_manager.cxx‎
Lines changed: 4 additions & 1 deletion b/‎nvbench/benchmark_manager.cxx‎
Lines changed: 4 additions & 1 deletion
@@ -25,6 +25,9 @@ features:
   * Batch Measurements:
     * Executes the benchmark multiple times back-to-back and records total time.
     * Reports the average execution time (total time / number of executions).
+  * [CPU-only Measurements](docs/benchmarks.md#cpu-only-benchmarks)
+    * Measures the host-side execution time of a non-GPU benchmark.
+    * Not suitable for microbenchmarking.
 
 # Supported Compilers and Tools
 
@@ -65,6 +68,7 @@ This repository provides a number of [examples](examples/) that demonstrate
 various NVBench features and usecases:
 
 - [Runtime and compile-time parameter sweeps](examples/axes.cu)
+- [CPU-only benchmarking](examples/cpu_only.cu)
 - [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
 - [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
 - [Skipping benchmark configurations](examples/skip.cu)
@@ -171,6 +175,7 @@ testing and parameter tuning of individual kernels. For in-depth analysis of
 end-to-end performance of multiple applications, the NVIDIA Nsight tools are
 more appropriate.
 
-NVBench is focused on evaluating the performance of CUDA kernels and is not
-optimized for CPU microbenchmarks. This may change in the future, but for now,
+NVBench is focused on evaluating the performance of CUDA kernels. It also provides
+CPU-only benchmarking facilities intended for non-trivial CPU workloads, but is
+not optimized for CPU microbenchmarks. This may change in the future, but for now,
 consider using Google Benchmark for high resolution CPU benchmarks.
@@ -4,7 +4,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
 
 ```cpp
 void my_benchmark(nvbench::state& state) {
-  state.exec([](nvbench::launch& launch) { 
+  state.exec([](nvbench::launch& launch) {
     my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
   });
 }
@@ -97,7 +97,7 @@ void benchmark(nvbench::state& state)
   const auto num_inputs = state.get_int64("NumInputs");
   thrust::device_vector<int> data = generate_input(num_inputs);
 
-  state.exec([&data](nvbench::launch& launch) { 
+  state.exec([&data](nvbench::launch& launch) {
     my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
   });
 }
@@ -134,7 +134,7 @@ void benchmark(nvbench::state& state)
   const auto quality = state.get_float64("Quality");
 
   state.exec([&quality](nvbench::launch& launch)
-  { 
+  {
     my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(quality);
   });
 }
@@ -153,7 +153,7 @@ void benchmark(nvbench::state& state)
   thrust::device_vector<int> data = generate_input(rng_dist);
 
   state.exec([&data](nvbench::launch& launch)
-  { 
+  {
     my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
   });
 }
@@ -182,7 +182,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
   thrust::device_vector<T> data = generate_input<T>();
 
   state.exec([&data](nvbench::launch& launch)
-  { 
+  {
     my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
   });
 }
@@ -266,7 +266,6 @@ In general::
 
 More examples can be found in [examples/throughput.cu](../examples/throughput.cu).
 
-
 # Skip Uninteresting / Invalid Benchmarks
 
 Sometimes particular combinations of parameters aren't useful or interesting —
@@ -294,7 +293,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>)
 // Skip benchmarks at compile time -- for example, always skip when T == U
 // (Note that the `type_list` argument defines the same type twice).
 template <typename SameType>
-void my_benchmark(nvbench::state& state, 
+void my_benchmark(nvbench::state& state,
                   nvbench::type_list<SameType, SameType>)
 {
   state.skip("T must not be the same type as U.");
@@ -320,6 +319,15 @@ true:
   synchronize internally.
 - `nvbench::exec_tag::timer` requests a timer object that can be used to
   restrict the timed region.
+- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables
+  them during execution to reduce runtime, and prevents their compilation to
+  reduce compile-time and binary size.
+- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking
+  code from being compiled for a particular benchmark. A runtime error is emitted
+  if the benchmark is defined with `set_is_cpu_only(true)`.
+- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking
+  code from being compiled for a particular benchmark. A runtime error is emitted
+  if the benchmark does not also define `set_is_cpu_only(true)`.
 
 Multiple execution tags may be combined using `operator|`, e.g.
 
@@ -370,7 +378,7 @@ Note that using manual timer mode disables batch measurements.
 void timer_example(nvbench::state& state)
 {
   // Pass the `timer` exec tag to request a timer:
-  state.exec(nvbench::exec_tag::timer, 
+  state.exec(nvbench::exec_tag::timer,
     // Lambda now accepts a timer:
     [](nvbench::launch& launch, auto& timer)
     {
@@ -391,6 +399,79 @@ NVBENCH_BENCH(timer_example);
 See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
 example.
 
+## Compilation hints: `nvbench::exec_tag::no_batch`, `gpu`, and `no_gpu`
+
+These execution tags are optional hints that disable the compilation of various
+code paths when they are not needed. They apply only to a single benchmark.
+
+- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend.
+- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends.
+  - Requires that the benchmark does not define `set_is_cpu_only(true)`.
+  - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
+  - Host-side CPU measurements of GPU kernel execution time are still provided.
+- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends.
+  - Requires that the benchmark defines `set_is_cpu_only(true)`.
+  - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
+  - See also [CPU-only Benchmarks](#cpu-only-benchmarks).
+
+# CPU-only Benchmarks
+
+NVBench provides CPU-only benchmarking facilities that are intended for measuring
+significant CPU workloads. We do not recommend using these features for high-resolution
+CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for
+such applications. Examples are provided in [examples/cpu_only.cu](../examples/cpu_only.cu).
+
+Note that NVBench still requires a CUDA compiler and runtime even if a project only contains
+CPU-only benchmarks.
+
+The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements:
+
+```cpp
+void my_cpu_benchmark(nvbench::state &state)
+{
+  state.exec([](nvbench::launch &) { /* workload */ });
+}
+NVBENCH_BENCH(my_cpu_benchmark)
+  .set_is_cpu_only(true); // Mark as CPU-only.
+```
+
+The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and
+binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while
+`is_cpu_only` is false.
+
+```cpp
+void my_cpu_benchmark(nvbench::state &state)
+{
+  state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends
+             [](nvbench::launch &) { /* workload */ });
+}
+NVBENCH_BENCH(my_cpu_benchmark)
+  .set_is_cpu_only(true); // Mark as CPU-only.
+```
+
+The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This
+is useful for benchmarks that require additional per-sample setup/teardown. See the
+[`nvbench::exec_tag::timer`](#explicit-timer-mode-nvbenchexec_tagtimer) section for more
+details.
+
+```cpp
+void my_cpu_benchmark(nvbench::state &state)
+{
+  state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends
+             nvbench::exec_tag::timer,   // Request a timer object
+             [](nvbench::launch &, auto &timer)
+    {
+      // Setup here
+      timer.start();
+      // timed workload
+      timer.stop();
+      // teardown here
+    });
+}
+NVBENCH_BENCH(my_cpu_benchmark)
+  .set_is_cpu_only(true); // Mark as CPU-only.
+```
+
 # Beware: Combinatorial Explosion Is Lurking
 
 Be very careful of how quickly the configuration space can grow. The following
@@ -403,7 +484,7 @@ using value_types = nvbench::type_list<nvbench::uint8_t,
                                        nvbench::int32_t,
                                        nvbench::float32_t,
                                        nvbench::float64_t>;
-using op_types = nvbench::type_list<thrust::plus<>, 
+using op_types = nvbench::type_list<thrust::plus<>,
                                     thrust::multiplies<>,
                                     thrust::maximum<>>;
 
@@ -418,7 +499,7 @@ NVBENCH_BENCH_TYPES(my_benchmark,
 
 ```
 960 total configs
-= 4 [T=(U8, I32, F32, F64)] 
+= 4 [T=(U8, I32, F32, F64)]
 * 4 [U=(U8, I32, F32, F64)]
 * 4 [V=(U8, I32, F32, F64)]
 * 3 [Op=(plus, multiplies, max)]
@@ -427,8 +508,8 @@ NVBENCH_BENCH_TYPES(my_benchmark,
 
 For large configuration spaces like this, pruning some of the less useful
 combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
-described in the "Skip Uninteresting / Invalid Benchmarks" section can help
-immensely with keeping compile / run times manageable.
+described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks)
+section can help immensely with keeping compile / run times manageable.
 
 Splitting a single large configuration space into multiple, more focused
 benchmarks with reduced dimensionality will likely be worth the effort as well.
@@ -2,6 +2,7 @@ set(example_srcs
   auto_throughput.cu
   axes.cu
   custom_criterion.cu
+  cpu_only.cu
   enums.cu
   exec_tag_sync.cu
   exec_tag_timer.cu
 
@@ -0,0 +1,83 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <chrono>
+#include <thread>
+
+// Block execution of the current CPU thread for `seconds` seconds.
+void sleep_host(double seconds)
+{
+  std::this_thread::sleep_for(
+    std::chrono::milliseconds(static_cast<nvbench::int64_t>(seconds * 1000)));
+}
+
+//=============================================================================
+// Simple CPU-only benchmark that sleeps on host for a specified duration.
+void simple(nvbench::state &state)
+{
+  const auto duration = state.get_float64("Duration");
+
+  state.exec([duration](nvbench::launch &) { sleep_host(duration); });
+}
+NVBENCH_BENCH(simple)
+  // 100 -> 500 ms in 100 ms increments.
+  .add_float64_axis("Duration", nvbench::range(.1, .5, .1))
+  // Mark as CPU-only.
+  .set_is_cpu_only(true);
+
+//=============================================================================
+// Simple CPU-only benchmark that sleeps on host for a specified duration and
+// uses a custom timed region.
+void simple_timer(nvbench::state &state)
+{
+  const auto duration = state.get_float64("Duration");
+
+  state.exec(nvbench::exec_tag::timer, [duration](nvbench::launch &, auto &timer) {
+    // Do any setup work before starting the timer here...
+    timer.start();
+
+    // The region of code to be timed:
+    sleep_host(duration);
+
+    timer.stop();
+    // Any per-run cleanup here...
+  });
+}
+NVBENCH_BENCH(simple_timer)
+  // 100 -> 500 ms in 100 ms increments.
+  .add_float64_axis("Duration", nvbench::range(.1, .5, .1))
+  // Mark as CPU-only.
+  .set_is_cpu_only(true);
+
+//=============================================================================
+// Simple CPU-only benchmark that uses the optional `nvbench::exec_tag::no_gpu`
+// hint to prevent GPU measurement code from being instantiated. Note that
+// `set_is_cpu_only(true)` is still required when using this hint.
+void simple_no_gpu(nvbench::state &state)
+{
+  const auto duration = state.get_float64("Duration");
+
+  state.exec(nvbench::exec_tag::no_gpu, [duration](nvbench::launch &) { sleep_host(duration); });
+}
+NVBENCH_BENCH(simple_no_gpu)
+  // 100 -> 500 ms in 100 ms increments.
+  .add_float64_axis("Duration", nvbench::range(.1, .5, .1))
+  // Mark as CPU-only.
+  .set_is_cpu_only(true);
@@ -25,6 +25,7 @@ set(srcs
 
   detail/entropy_criterion.cxx
   detail/measure_cold.cu
+  detail/measure_cpu_only.cxx
   detail/measure_hot.cu
   detail/state_generator.cxx
   detail/stdrel_criterion.cxx
 
@@ -159,6 +159,16 @@ struct benchmark_base
   }
   /// @}
 
+  /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
+  /// @{
+  [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
+  benchmark_base &set_is_cpu_only(bool is_cpu_only)
+  {
+    m_is_cpu_only = is_cpu_only;
+    return *this;
+  }
+  /// @}
+
   /// If true, the benchmark is only run once, skipping all warmup runs and only
   /// executing a single non-batched measurement. This is intended for use with
   /// external profiling tools. @{
@@ -263,6 +273,7 @@ protected:
 
   optional_ref<nvbench::printer_base> m_printer;
 
+  bool m_is_cpu_only{false};
   bool m_run_once{false};
   bool m_disable_blocking_kernel{false};
 
 
@@ -34,12 +34,18 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_axes    = m_axes;
   result->m_devices = m_devices;
 
-  result->m_min_samples      = m_min_samples;
-  result->m_criterion_params = m_criterion_params;
+  result->m_printer = m_printer;
+
+  result->m_is_cpu_only             = m_is_cpu_only;
+  result->m_run_once                = m_run_once;
+  result->m_disable_blocking_kernel = m_disable_blocking_kernel;
+
+  result->m_min_samples = m_min_samples;
 
   result->m_skip_time = m_skip_time;
   result->m_timeout   = m_timeout;
 
+  result->m_criterion_params   = m_criterion_params;
   result->m_stopping_criterion = m_stopping_criterion;
 
   return result;
 
@@ -40,7 +40,10 @@ void benchmark_manager::initialize()
   const auto& mgr = device_manager::get();
   for (auto& bench : m_benchmarks)
   {
-    bench->set_devices(mgr.get_devices());
+    if (!bench->get_is_cpu_only())
+    {
+      bench->set_devices(mgr.get_devices());
+    }
   }
 }
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,10 @@ void benchmark_manager::initialize()`
`40`	`40`	`const auto& mgr = device_manager::get();`
`41`	`41`	`for (auto& bench : m_benchmarks)`
`42`	`42`	`{`
`43`		`- bench->set_devices(mgr.get_devices());`
	`43`	`+ if (!bench->get_is_cpu_only())`
	`44`	`+ {`
	`45`	`+ bench->set_devices(mgr.get_devices());`
	`46`	`+ }`
`44`	`47`	`}`
`45`	`48`	`}`
`46`	`49`