Skip to content

Commit 19e9fc5

Browse files
committed
WIP CPU-only
1 parent 1efed5f commit 19e9fc5

14 files changed

+673
-68
lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ set(example_srcs
22
auto_throughput.cu
33
axes.cu
44
custom_criterion.cu
5+
cpu_only.cu
56
enums.cu
67
exec_tag_sync.cu
78
exec_tag_timer.cu

examples/cpu_only.cu

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*
2+
* Copyright 2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#include <nvbench/nvbench.cuh>
20+
21+
#include <chrono>
22+
#include <thread>
23+
24+
// Block execution of the current CPU thread for `seconds` seconds.
//
// The request is handed to `sleep_for` as a floating-point duration, so
// fractional-millisecond values are honored instead of being truncated to a
// whole number of milliseconds. Zero or negative values return immediately.
void sleep_host(double seconds)
{
  std::this_thread::sleep_for(std::chrono::duration<double>(seconds));
}
30+
31+
//=============================================================================
32+
// Simple CPU-only benchmark that sleeps on host for a specified duration.
33+
void simple(nvbench::state &state)
34+
{
35+
const auto duration = state.get_float64("Duration");
36+
37+
state.exec([duration](nvbench::launch &) { sleep_host(duration); });
38+
}
39+
NVBENCH_BENCH(simple)
40+
// 100 -> 500 ms in 100 ms increments.
41+
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
42+
// Mark as CPU-only.
43+
.set_is_cpu_only(true);
44+
45+
//=============================================================================
46+
// Simple CPU-only benchmark that sleeps on host for a specified duration and
47+
// uses a custom timed region.
48+
void simple_timer(nvbench::state &state)
49+
{
50+
const auto duration = state.get_float64("Duration");
51+
52+
state.exec(nvbench::exec_tag::timer, [duration](nvbench::launch &, auto &timer) {
53+
// Do any setup work before starting the timer here...
54+
timer.start();
55+
56+
// The region of code to be timed:
57+
sleep_host(duration);
58+
59+
timer.stop();
60+
// Any per-run cleanup here...
61+
});
62+
}
63+
NVBENCH_BENCH(simple_timer)
64+
// 100 -> 500 ms in 100 ms increments.
65+
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
66+
// Mark as CPU-only.
67+
.set_is_cpu_only(true);
68+
69+
//=============================================================================
70+
// Simple CPU-only benchmark that uses the optional `nvbench::exec_tag::no_gpu`
71+
// hint to prevent GPU measurement code from being instantiated. Note that
72+
// `set_is_cpu_only(true)` is still required when using this hint.
73+
void simple_no_gpu(nvbench::state &state)
74+
{
75+
const auto duration = state.get_float64("Duration");
76+
77+
state.exec(nvbench::exec_tag::no_gpu, [duration](nvbench::launch &) { sleep_host(duration); });
78+
}
79+
NVBENCH_BENCH(simple_no_gpu)
80+
// 100 -> 500 ms in 100 ms increments.
81+
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
82+
// Mark as CPU-only.
83+
.set_is_cpu_only(true);

nvbench/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(srcs
2525

2626
detail/entropy_criterion.cxx
2727
detail/measure_cold.cu
28+
detail/measure_cpu_only.cxx
2829
detail/measure_hot.cu
2930
detail/state_generator.cxx
3031
detail/stdrel_criterion.cxx

nvbench/benchmark_base.cuh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,16 @@ struct benchmark_base
159159
}
160160
/// @}
161161

162+
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
163+
/// @{
164+
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
165+
benchmark_base &set_is_cpu_only(bool is_cpu_only)
166+
{
167+
m_is_cpu_only = is_cpu_only;
168+
return *this;
169+
}
170+
/// @}
171+
162172
/// If true, the benchmark is only run once, skipping all warmup runs and only
163173
/// executing a single non-batched measurement. This is intended for use with
164174
/// external profiling tools. @{
@@ -263,6 +273,7 @@ protected:
263273

264274
optional_ref<nvbench::printer_base> m_printer;
265275

276+
bool m_is_cpu_only{false};
266277
bool m_run_once{false};
267278
bool m_disable_blocking_kernel{false};
268279

nvbench/benchmark_base.cxx

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,18 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
3434
result->m_axes = m_axes;
3535
result->m_devices = m_devices;
3636

37-
result->m_min_samples = m_min_samples;
38-
result->m_criterion_params = m_criterion_params;
37+
result->m_printer = m_printer;
38+
39+
result->m_is_cpu_only = m_is_cpu_only;
40+
result->m_run_once = m_run_once;
41+
result->m_disable_blocking_kernel = m_disable_blocking_kernel;
42+
43+
result->m_min_samples = m_min_samples;
3944

4045
result->m_skip_time = m_skip_time;
4146
result->m_timeout = m_timeout;
4247

48+
result->m_criterion_params = m_criterion_params;
4349
result->m_stopping_criterion = m_stopping_criterion;
4450

4551
return result;

nvbench/benchmark_manager.cxx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@ void benchmark_manager::initialize()
4040
const auto& mgr = device_manager::get();
4141
for (auto& bench : m_benchmarks)
4242
{
43-
bench->set_devices(mgr.get_devices());
43+
if (!bench->get_is_cpu_only())
44+
{
45+
bench->set_devices(mgr.get_devices());
46+
}
4447
}
4548
}
4649

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Copyright 2021-2025 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#pragma once
20+
21+
#include <nvbench/cpu_timer.cuh>
22+
#include <nvbench/exec_tag.cuh>
23+
#include <nvbench/launch.cuh>
24+
#include <nvbench/stopping_criterion.cuh>
25+
26+
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
27+
#include <nvbench/detail/statistics.cuh>
28+
29+
#include <utility>
30+
#include <vector>
31+
32+
namespace nvbench
33+
{
34+
35+
struct state;
36+
37+
namespace detail
38+
{
39+
40+
// non-templated code goes here:
41+
struct measure_cpu_only_base
42+
{
43+
explicit measure_cpu_only_base(nvbench::state &exec_state);
44+
measure_cpu_only_base(const measure_cpu_only_base &) = delete;
45+
measure_cpu_only_base(measure_cpu_only_base &&) = delete;
46+
measure_cpu_only_base &operator=(const measure_cpu_only_base &) = delete;
47+
measure_cpu_only_base &operator=(measure_cpu_only_base &&) = delete;
48+
49+
protected:
50+
void check();
51+
void initialize();
52+
void run_trials_prologue();
53+
void record_measurements();
54+
bool is_finished();
55+
void run_trials_epilogue();
56+
void generate_summaries();
57+
58+
void check_skip_time(nvbench::float64_t warmup_time);
59+
60+
nvbench::state &m_state;
61+
62+
// Required to satisfy the KernelLauncher interface:
63+
nvbench::launch m_launch;
64+
65+
nvbench::cpu_timer m_cpu_timer;
66+
nvbench::cpu_timer m_walltime_timer;
67+
68+
nvbench::criterion_params m_criterion_params;
69+
nvbench::stopping_criterion_base& m_stopping_criterion;
70+
71+
bool m_run_once{false};
72+
73+
nvbench::int64_t m_min_samples{};
74+
75+
nvbench::float64_t m_skip_time{};
76+
nvbench::float64_t m_timeout{};
77+
78+
nvbench::int64_t m_total_samples{};
79+
nvbench::float64_t m_total_cpu_time{};
80+
nvbench::float64_t m_cpu_noise{}; // rel stdev
81+
82+
std::vector<nvbench::float64_t> m_cpu_times;
83+
84+
bool m_max_time_exceeded{};
85+
};
86+
87+
template <typename KernelLauncher>
88+
struct measure_cpu_only : public measure_cpu_only_base
89+
{
90+
measure_cpu_only(nvbench::state &state, KernelLauncher &kernel_launcher)
91+
: measure_cpu_only_base(state)
92+
, m_kernel_launcher{kernel_launcher}
93+
{}
94+
95+
void operator()()
96+
{
97+
this->check();
98+
this->initialize();
99+
this->run_warmup();
100+
101+
this->run_trials_prologue();
102+
this->run_trials();
103+
this->run_trials_epilogue();
104+
105+
this->generate_summaries();
106+
}
107+
108+
private:
109+
// Run the kernel once, measuring the CPU time. If under skip_time, skip the
110+
// measurement.
111+
void run_warmup()
112+
{
113+
if (m_run_once)
114+
{ // Skip warmups
115+
return;
116+
}
117+
118+
this->launch_kernel(m_cpu_timer);
119+
this->check_skip_time(m_cpu_timer.get_duration());
120+
}
121+
122+
void run_trials()
123+
{
124+
do
125+
{
126+
this->launch_kernel(m_cpu_timer);
127+
this->record_measurements();
128+
} while (!this->is_finished());
129+
}
130+
131+
template <typename TimerT>
132+
__forceinline__ void launch_kernel(TimerT &timer)
133+
{
134+
m_kernel_launcher(m_launch, timer);
135+
}
136+
137+
KernelLauncher &m_kernel_launcher;
138+
};
139+
140+
} // namespace detail
141+
} // namespace nvbench

0 commit comments

Comments
 (0)