Skip to content

Commit 851d7aa

Browse files
committed
Make blocking kernel use a runtime option.
It's not worth instantiating multiple instances of the measurement class to handle this. Since there's already a runtime option to disable the blocking kernel, the current implementation by default will instantiate both the blocking and non-blocking versions of the algorithm for dynamic dispatch.
1 parent 52028be commit 851d7aa

File tree

5 files changed

+33
-41
lines changed

5 files changed

+33
-41
lines changed

nvbench/detail/measure_cold.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ measure_cold_base::measure_cold_base(state &exec_state)
3434
: m_state{exec_state}
3535
, m_launch{m_state.get_cuda_stream()}
3636
, m_criterion_params{exec_state.get_criterion_params()}
37-
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
37+
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
38+
exec_state.get_stopping_criterion())}
39+
, m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()}
3840
, m_run_once{exec_state.get_run_once()}
39-
, m_no_block{exec_state.get_disable_blocking_kernel()}
4041
, m_min_samples{exec_state.get_min_samples()}
4142
, m_skip_time{exec_state.get_skip_time()}
4243
, m_timeout{exec_state.get_timeout()}

nvbench/detail/measure_cold.cuh

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ struct measure_cold_base
5454
measure_cold_base &operator=(measure_cold_base &&) = delete;
5555

5656
protected:
57-
template <bool use_blocking_kernel>
5857
struct kernel_launch_timer;
58+
friend struct kernel_launch_timer;
5959

6060
void check();
6161
void initialize();
@@ -89,8 +89,8 @@ protected:
8989
nvbench::criterion_params m_criterion_params;
9090
nvbench::stopping_criterion_base& m_stopping_criterion;
9191

92+
bool m_disable_blocking_kernel{false};
9293
bool m_run_once{false};
93-
bool m_no_block{false};
9494

9595
nvbench::int64_t m_min_samples{};
9696

@@ -108,23 +108,23 @@ protected:
108108
bool m_max_time_exceeded{};
109109
};
110110

111-
template <bool use_blocking_kernel>
112111
struct measure_cold_base::kernel_launch_timer
113112
{
114113
kernel_launch_timer(measure_cold_base &measure)
115114
: m_measure{measure}
115+
, m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
116116
{}
117117

118118
__forceinline__ void start()
119119
{
120120
m_measure.flush_device_l2();
121121
m_measure.sync_stream();
122-
if constexpr (use_blocking_kernel)
122+
if (!m_disable_blocking_kernel)
123123
{
124124
m_measure.block_stream();
125125
}
126126
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
127-
if constexpr (!use_blocking_kernel)
127+
if (m_disable_blocking_kernel)
128128
{
129129
m_measure.m_cpu_timer.start();
130130
}
@@ -133,7 +133,7 @@ struct measure_cold_base::kernel_launch_timer
133133
__forceinline__ void stop()
134134
{
135135
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
136-
if constexpr (use_blocking_kernel)
136+
if (!m_disable_blocking_kernel)
137137
{
138138
m_measure.m_cpu_timer.start();
139139
m_measure.unblock_stream();
@@ -144,9 +144,10 @@ struct measure_cold_base::kernel_launch_timer
144144

145145
private:
146146
measure_cold_base &m_measure;
147+
bool m_disable_blocking_kernel;
147148
};
148149

149-
template <typename KernelLauncher, bool use_blocking_kernel>
150+
template <typename KernelLauncher>
150151
struct measure_cold : public measure_cold_base
151152
{
152153
measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -177,15 +178,15 @@ private:
177178
return;
178179
}
179180

180-
kernel_launch_timer<use_blocking_kernel> timer(*this);
181+
kernel_launch_timer timer(*this);
181182

182183
this->launch_kernel(timer);
183184
this->check_skip_time(m_cuda_timer.get_duration());
184185
}
185186

186187
void run_trials()
187188
{
188-
kernel_launch_timer<use_blocking_kernel> timer(*this);
189+
kernel_launch_timer timer(*this);
189190
do
190191
{
191192
this->launch_kernel(timer);

nvbench/detail/measure_hot.cuh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,11 @@ protected:
8080
nvbench::int64_t m_total_samples{};
8181
nvbench::float64_t m_total_cuda_time{};
8282

83+
bool m_disable_blocking_kernel{false};
8384
bool m_max_time_exceeded{false};
8485
};
8586

86-
template <typename KernelLauncher, bool use_blocking_kernel>
87+
template <typename KernelLauncher>
8788
struct measure_hot : public measure_hot_base
8889
{
8990
measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -105,7 +106,7 @@ private:
105106
// measurement.
106107
void run_warmup()
107108
{
108-
if constexpr (use_blocking_kernel)
109+
if (!m_disable_blocking_kernel)
109110
{
110111
this->block_stream();
111112
}
@@ -114,7 +115,7 @@ private:
114115
this->launch_kernel();
115116
m_cuda_timer.stop(m_launch.get_stream());
116117

117-
if constexpr (use_blocking_kernel)
118+
if (!m_disable_blocking_kernel)
118119
{
119120
this->unblock_stream();
120121
}
@@ -137,7 +138,7 @@ private:
137138
{
138139
batch_size = std::max(batch_size, nvbench::int64_t{1});
139140

140-
if constexpr (use_blocking_kernel)
141+
if (!m_disable_blocking_kernel)
141142
{
142143
// Block stream until some work is queued.
143144
// Limit the number of kernel executions while blocked to prevent

nvbench/detail/state_exec.cuh

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -79,17 +79,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
7979
return;
8080
}
8181

82-
// TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold
83-
// and measure_hot. Currently this causes unnecessary codegen. Note that the `sync` exec_tag
84-
// implies `no_block` when refactoring.
85-
if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) &&
86-
this->get_disable_blocking_kernel())
87-
{
88-
constexpr auto no_block_tags = tags | no_block;
89-
this->exec(no_block_tags, std::forward<KernelLauncher>(kernel_launcher));
90-
return;
91-
}
92-
9382
// If no measurements selected, pick some defaults based on the modifiers:
9483
if constexpr (!measure_tags)
9584
{
@@ -123,6 +112,12 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
123112
return;
124113
}
125114

115+
// Syncing will cause the blocking kernel pattern to deadlock:
116+
if constexpr (modifier_tags & sync)
117+
{
118+
this->set_disable_blocking_kernel(true);
119+
}
120+
126121
if (this->is_skipped())
127122
{
128123
return;
@@ -157,7 +152,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
157152
{
158153
static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag.");
159154

160-
constexpr bool use_blocking_kernel = !(tags & no_block);
161155
if constexpr (tags & timer)
162156
{
163157
// Estimate bandwidth here
@@ -173,7 +167,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
173167
}
174168
#endif
175169

176-
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
170+
using measure_t = nvbench::detail::measure_cold<KL>;
177171
measure_t measure{*this, kernel_launcher};
178172
measure();
179173
}
@@ -195,7 +189,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
195189
}
196190
#endif
197191

198-
using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
192+
using measure_t = nvbench::detail::measure_cold<wrapper_t>;
199193
measure_t measure(*this, wrapper);
200194
measure();
201195
}
@@ -207,8 +201,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
207201
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
208202
static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
209203
static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
210-
constexpr bool use_blocking_kernel = !(tags & no_block);
211-
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
204+
using measure_t = nvbench::detail::measure_hot<KL>;
212205
measure_t measure{*this, kernel_launcher};
213206
measure();
214207
}

nvbench/exec_tag.cuh

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,11 @@ enum class exec_flag
3232

3333
// Modifiers:
3434
timer = 0x01, // KernelLauncher uses manual timing
35-
no_block = 0x02, // Disables use of `blocking_kernel`.
36-
sync = 0x04, // KernelLauncher has indicated that it will sync
37-
run_once = 0x08, // Only run the benchmark once (for profiling).
38-
gpu = 0x10, // Don't instantiate `measure_cpu_only`.
39-
no_gpu = 0x20, // No GPU measurements should be instantiated.
40-
no_batch = 0x40, // `measure_hot` will not be used.
35+
sync = 0x02, // KernelLauncher has indicated that it will sync
36+
run_once = 0x04, // Only run the benchmark once (for profiling).
37+
gpu = 0x08, // Don't instantiate `measure_cpu_only`.
38+
no_gpu = 0x10, // No GPU measurements should be instantiated.
39+
no_batch = 0x20, // `measure_hot` will not be used.
4140
modifier_mask = 0xFF,
4241

4342
// Measurement types to instantiate. Derived from modifiers.
@@ -97,7 +96,6 @@ struct tag
9796

9897
using none_t = tag<nvbench::detail::exec_flag::none>;
9998
using timer_t = tag<nvbench::detail::exec_flag::timer>;
100-
using no_block_t = tag<nvbench::detail::exec_flag::no_block>;
10199
using sync_t = tag<nvbench::detail::exec_flag::sync>;
102100
using run_once_t = tag<nvbench::detail::exec_flag::run_once>;
103101
using gpu_t = tag<nvbench::detail::exec_flag::gpu>;
@@ -112,7 +110,6 @@ using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
112110

113111
constexpr inline none_t none;
114112
constexpr inline timer_t timer;
115-
constexpr inline no_block_t no_block;
116113
constexpr inline sync_t sync;
117114
constexpr inline run_once_t run_once;
118115
constexpr inline gpu_t gpu;
@@ -136,8 +133,7 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer | //
136133

137134
/// Modifier used to indicate that the KernelGenerator will perform CUDA
138135
/// synchronizations. Without this flag such benchmarks will deadlock.
139-
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | //
140-
nvbench::exec_tag::impl::sync | //
136+
constexpr inline auto sync = nvbench::exec_tag::impl::sync | //
141137
nvbench::exec_tag::impl::no_batch;
142138

143139
/// Modifier used to indicate that batched measurements should be disabled

0 commit comments

Comments
 (0)