Skip to content

Commit beca2c0

Browse files
authored
Merge pull request #203 from alliepiper/exec_tag_cleanup
Clean up unnecessary exec_tags.
2 parents 52028be + 3536061 commit beca2c0

File tree

5 files changed

+59
-85
lines changed

5 files changed

+59
-85
lines changed

nvbench/detail/measure_cold.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ measure_cold_base::measure_cold_base(state &exec_state)
3434
: m_state{exec_state}
3535
, m_launch{m_state.get_cuda_stream()}
3636
, m_criterion_params{exec_state.get_criterion_params()}
37-
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
37+
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
38+
exec_state.get_stopping_criterion())}
39+
, m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()}
3840
, m_run_once{exec_state.get_run_once()}
39-
, m_no_block{exec_state.get_disable_blocking_kernel()}
4041
, m_min_samples{exec_state.get_min_samples()}
4142
, m_skip_time{exec_state.get_skip_time()}
4243
, m_timeout{exec_state.get_timeout()}

nvbench/detail/measure_cold.cuh

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ struct measure_cold_base
5454
measure_cold_base &operator=(measure_cold_base &&) = delete;
5555

5656
protected:
57-
template <bool use_blocking_kernel>
5857
struct kernel_launch_timer;
58+
friend struct kernel_launch_timer;
5959

6060
void check();
6161
void initialize();
@@ -89,8 +89,8 @@ protected:
8989
nvbench::criterion_params m_criterion_params;
9090
nvbench::stopping_criterion_base& m_stopping_criterion;
9191

92+
bool m_disable_blocking_kernel{false};
9293
bool m_run_once{false};
93-
bool m_no_block{false};
9494

9595
nvbench::int64_t m_min_samples{};
9696

@@ -108,23 +108,23 @@ protected:
108108
bool m_max_time_exceeded{};
109109
};
110110

111-
template <bool use_blocking_kernel>
112111
struct measure_cold_base::kernel_launch_timer
113112
{
114113
kernel_launch_timer(measure_cold_base &measure)
115114
: m_measure{measure}
115+
, m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
116116
{}
117117

118118
__forceinline__ void start()
119119
{
120120
m_measure.flush_device_l2();
121121
m_measure.sync_stream();
122-
if constexpr (use_blocking_kernel)
122+
if (!m_disable_blocking_kernel)
123123
{
124124
m_measure.block_stream();
125125
}
126126
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
127-
if constexpr (!use_blocking_kernel)
127+
if (m_disable_blocking_kernel)
128128
{
129129
m_measure.m_cpu_timer.start();
130130
}
@@ -133,7 +133,7 @@ struct measure_cold_base::kernel_launch_timer
133133
__forceinline__ void stop()
134134
{
135135
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
136-
if constexpr (use_blocking_kernel)
136+
if (!m_disable_blocking_kernel)
137137
{
138138
m_measure.m_cpu_timer.start();
139139
m_measure.unblock_stream();
@@ -144,9 +144,10 @@ struct measure_cold_base::kernel_launch_timer
144144

145145
private:
146146
measure_cold_base &m_measure;
147+
bool m_disable_blocking_kernel;
147148
};
148149

149-
template <typename KernelLauncher, bool use_blocking_kernel>
150+
template <typename KernelLauncher>
150151
struct measure_cold : public measure_cold_base
151152
{
152153
measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -177,15 +178,15 @@ private:
177178
return;
178179
}
179180

180-
kernel_launch_timer<use_blocking_kernel> timer(*this);
181+
kernel_launch_timer timer(*this);
181182

182183
this->launch_kernel(timer);
183184
this->check_skip_time(m_cuda_timer.get_duration());
184185
}
185186

186187
void run_trials()
187188
{
188-
kernel_launch_timer<use_blocking_kernel> timer(*this);
189+
kernel_launch_timer timer(*this);
189190
do
190191
{
191192
this->launch_kernel(timer);

nvbench/detail/measure_hot.cuh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,11 @@ protected:
8080
nvbench::int64_t m_total_samples{};
8181
nvbench::float64_t m_total_cuda_time{};
8282

83+
bool m_disable_blocking_kernel{false};
8384
bool m_max_time_exceeded{false};
8485
};
8586

86-
template <typename KernelLauncher, bool use_blocking_kernel>
87+
template <typename KernelLauncher>
8788
struct measure_hot : public measure_hot_base
8889
{
8990
measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -105,7 +106,7 @@ private:
105106
// measurement.
106107
void run_warmup()
107108
{
108-
if constexpr (use_blocking_kernel)
109+
if (!m_disable_blocking_kernel)
109110
{
110111
this->block_stream();
111112
}
@@ -114,7 +115,7 @@ private:
114115
this->launch_kernel();
115116
m_cuda_timer.stop(m_launch.get_stream());
116117

117-
if constexpr (use_blocking_kernel)
118+
if (!m_disable_blocking_kernel)
118119
{
119120
this->unblock_stream();
120121
}
@@ -137,7 +138,7 @@ private:
137138
{
138139
batch_size = std::max(batch_size, nvbench::int64_t{1});
139140

140-
if constexpr (use_blocking_kernel)
141+
if (!m_disable_blocking_kernel)
141142
{
142143
// Block stream until some work is queued.
143144
// Limit the number of kernel executions while blocked to prevent

nvbench/detail/state_exec.cuh

Lines changed: 36 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
4545
{
4646
using KL = typename std::remove_reference<KernelLauncher>::type;
4747
using namespace nvbench::exec_tag::impl;
48+
4849
static_assert(is_exec_tag_v<ExecTags>,
4950
"`ExecTags` argument must be a member (or combination of members) from "
5051
"`nvbench::exec_tag`.");
@@ -55,41 +56,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
5556
constexpr auto modifier_tags = tags & modifier_mask;
5657
constexpr auto measure_tags = tags & measure_mask;
5758

58-
if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
59-
{
60-
throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
61-
"`set_is_cpu_only(true)` is called when defining the benchmark.");
62-
}
63-
64-
if ((modifier_tags & gpu) && this->get_is_cpu_only())
65-
{
66-
throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
67-
"`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
68-
}
69-
70-
// "run once" should disable batch measurements:
71-
// TODO This should just be a runtime branch in measure_cold. Currently this causes two versions
72-
// of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be
73-
// removed.
74-
// TODO CPU measurements should support run_once as well.
75-
if (!(modifier_tags & run_once) && this->get_run_once())
76-
{
77-
constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot);
78-
this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
79-
return;
80-
}
81-
82-
// TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold
83-
// and measure_hot. Currently this causes unnecessary codegen. Note that the `sync` exec_tag
84-
// implies `no_block` when refactoring.
85-
if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) &&
86-
this->get_disable_blocking_kernel())
87-
{
88-
constexpr auto no_block_tags = tags | no_block;
89-
this->exec(no_block_tags, std::forward<KernelLauncher>(kernel_launcher));
90-
return;
91-
}
92-
9359
// If no measurements selected, pick some defaults based on the modifiers:
9460
if constexpr (!measure_tags)
9561
{
@@ -123,6 +89,24 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
12389
return;
12490
}
12591

92+
if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
93+
{
94+
throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
95+
"`set_is_cpu_only(true)` is called when defining the benchmark.");
96+
}
97+
98+
if ((modifier_tags & gpu) && this->get_is_cpu_only())
99+
{
100+
throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
101+
"`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
102+
}
103+
104+
// Syncing will cause the blocking kernel pattern to deadlock:
105+
if constexpr (modifier_tags & sync)
106+
{
107+
this->set_disable_blocking_kernel(true);
108+
}
109+
126110
if (this->is_skipped())
127111
{
128112
return;
@@ -157,23 +141,18 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
157141
{
158142
static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag.");
159143

160-
constexpr bool use_blocking_kernel = !(tags & no_block);
161144
if constexpr (tags & timer)
162145
{
163-
// Estimate bandwidth here
164146
#ifdef NVBENCH_HAS_CUPTI
165-
if constexpr (!(modifier_tags & run_once))
147+
if (this->is_cupti_required() && !this->get_run_once())
166148
{
167-
if (this->is_cupti_required())
168-
{
169-
using measure_t = nvbench::detail::measure_cupti<KL>;
170-
measure_t measure{*this, kernel_launcher};
171-
measure();
172-
}
149+
using measure_t = nvbench::detail::measure_cupti<KL>;
150+
measure_t measure{*this, kernel_launcher};
151+
measure();
173152
}
174153
#endif
175154

176-
using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
155+
using measure_t = nvbench::detail::measure_cold<KL>;
177156
measure_t measure{*this, kernel_launcher};
178157
measure();
179158
}
@@ -182,20 +161,16 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
182161
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
183162
wrapper_t wrapper{kernel_launcher};
184163

185-
// Estimate bandwidth here
186164
#ifdef NVBENCH_HAS_CUPTI
187-
if constexpr (!(modifier_tags & run_once))
165+
if (this->is_cupti_required() && !this->get_run_once())
188166
{
189-
if (this->is_cupti_required())
190-
{
191-
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
192-
measure_t measure{*this, wrapper};
193-
measure();
194-
}
167+
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
168+
measure_t measure{*this, wrapper};
169+
measure();
195170
}
196171
#endif
197172

198-
using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
173+
using measure_t = nvbench::detail::measure_cold<wrapper_t>;
199174
measure_t measure(*this, wrapper);
200175
measure();
201176
}
@@ -207,10 +182,13 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
207182
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
208183
static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
209184
static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
210-
constexpr bool use_blocking_kernel = !(tags & no_block);
211-
using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
212-
measure_t measure{*this, kernel_launcher};
213-
measure();
185+
186+
if (!this->get_run_once())
187+
{
188+
using measure_t = nvbench::detail::measure_hot<KL>;
189+
measure_t measure{*this, kernel_launcher};
190+
measure();
191+
}
214192
}
215193
}
216194
}

nvbench/exec_tag.cuh

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,10 @@ enum class exec_flag
3232

3333
// Modifiers:
3434
timer = 0x01, // KernelLauncher uses manual timing
35-
no_block = 0x02, // Disables use of `blocking_kernel`.
36-
sync = 0x04, // KernelLauncher has indicated that it will sync
37-
run_once = 0x08, // Only run the benchmark once (for profiling).
38-
gpu = 0x10, // Don't instantiate `measure_cpu_only`.
39-
no_gpu = 0x20, // No GPU measurements should be instantiated.
40-
no_batch = 0x40, // `measure_hot` will not be used.
35+
sync = 0x02, // KernelLauncher has indicated that it will sync
36+
gpu = 0x04, // Don't instantiate `measure_cpu_only`.
37+
no_gpu = 0x08, // No GPU measurements should be instantiated.
38+
no_batch = 0x10, // `measure_hot` will not be used.
4139
modifier_mask = 0xFF,
4240

4341
// Measurement types to instantiate. Derived from modifiers.
@@ -97,9 +95,7 @@ struct tag
9795

9896
using none_t = tag<nvbench::detail::exec_flag::none>;
9997
using timer_t = tag<nvbench::detail::exec_flag::timer>;
100-
using no_block_t = tag<nvbench::detail::exec_flag::no_block>;
10198
using sync_t = tag<nvbench::detail::exec_flag::sync>;
102-
using run_once_t = tag<nvbench::detail::exec_flag::run_once>;
10399
using gpu_t = tag<nvbench::detail::exec_flag::gpu>;
104100
using no_gpu_t = tag<nvbench::detail::exec_flag::no_gpu>;
105101
using no_batch_t = tag<nvbench::detail::exec_flag::no_batch>;
@@ -112,9 +108,7 @@ using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
112108

113109
constexpr inline none_t none;
114110
constexpr inline timer_t timer;
115-
constexpr inline no_block_t no_block;
116111
constexpr inline sync_t sync;
117-
constexpr inline run_once_t run_once;
118112
constexpr inline gpu_t gpu;
119113
constexpr inline no_gpu_t no_gpu;
120114
constexpr inline no_batch_t no_batch;
@@ -136,8 +130,7 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer | //
136130

137131
/// Modifier used to indicate that the KernelGenerator will perform CUDA
138132
/// synchronizations. Without this flag such benchmarks will deadlock.
139-
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | //
140-
nvbench::exec_tag::impl::sync | //
133+
constexpr inline auto sync = nvbench::exec_tag::impl::sync | //
141134
nvbench::exec_tag::impl::no_batch;
142135

143136
/// Modifier used to indicate that batched measurements should be disabled

0 commit comments

Comments
 (0)