Merge pull request #203 from alliepiper/exec_tag_cleanup

alliepiper · web-flow · commit beca2c0038b6 · 2025-04-08T13:35:34.000-04:00
Clean up unnecessary exec_tags.
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
@@ -34,9 +34,10 @@ measure_cold_base::measure_cold_base(state &exec_state)
     : m_state{exec_state}
     , m_launch{m_state.get_cuda_stream()}
     , m_criterion_params{exec_state.get_criterion_params()}
-    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
+    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
+        exec_state.get_stopping_criterion())}
+    , m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()}
     , m_run_once{exec_state.get_run_once()}
-    , m_no_block{exec_state.get_disable_blocking_kernel()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
@@ -54,8 +54,8 @@ struct measure_cold_base
   measure_cold_base &operator=(measure_cold_base &&)      = delete;
 
 protected:
-  template <bool use_blocking_kernel>
   struct kernel_launch_timer;
+  friend struct kernel_launch_timer;
 
   void check();
   void initialize();
@@ -89,8 +89,8 @@ protected:
   nvbench::criterion_params m_criterion_params;
   nvbench::stopping_criterion_base& m_stopping_criterion;
 
+  bool m_disable_blocking_kernel{false};
   bool m_run_once{false};
-  bool m_no_block{false};
 
   nvbench::int64_t m_min_samples{};
 
@@ -108,23 +108,23 @@ protected:
   bool m_max_time_exceeded{};
 };
 
-template <bool use_blocking_kernel>
 struct measure_cold_base::kernel_launch_timer
 {
   kernel_launch_timer(measure_cold_base &measure)
       : m_measure{measure}
+      , m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
   {}
 
   __forceinline__ void start()
   {
     m_measure.flush_device_l2();
     m_measure.sync_stream();
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
     {
       m_measure.block_stream();
     }
     m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
-    if constexpr (!use_blocking_kernel)
+    if (m_disable_blocking_kernel)
     {
       m_measure.m_cpu_timer.start();
     }
@@ -133,7 +133,7 @@ struct measure_cold_base::kernel_launch_timer
   __forceinline__ void stop()
   {
     m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
     {
       m_measure.m_cpu_timer.start();
       m_measure.unblock_stream();
@@ -144,9 +144,10 @@ struct measure_cold_base::kernel_launch_timer
 
 private:
   measure_cold_base &m_measure;
+  bool m_disable_blocking_kernel;
 };
 
-template <typename KernelLauncher, bool use_blocking_kernel>
+template <typename KernelLauncher>
 struct measure_cold : public measure_cold_base
 {
   measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -177,15 +178,15 @@ private:
       return;
     }
 
-    kernel_launch_timer<use_blocking_kernel> timer(*this);
+    kernel_launch_timer timer(*this);
 
     this->launch_kernel(timer);
     this->check_skip_time(m_cuda_timer.get_duration());
   }
 
   void run_trials()
   {
-    kernel_launch_timer<use_blocking_kernel> timer(*this);
+    kernel_launch_timer timer(*this);
     do
     {
       this->launch_kernel(timer);
diff --git a/nvbench/detail/measure_hot.cuh b/nvbench/detail/measure_hot.cuh
@@ -80,10 +80,11 @@ protected:
   nvbench::int64_t m_total_samples{};
   nvbench::float64_t m_total_cuda_time{};
 
+  bool m_disable_blocking_kernel{false};
   bool m_max_time_exceeded{false};
 };
 
-template <typename KernelLauncher, bool use_blocking_kernel>
+template <typename KernelLauncher>
 struct measure_hot : public measure_hot_base
 {
   measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -105,7 +106,7 @@ private:
   // measurement.
   void run_warmup()
   {
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
     {
       this->block_stream();
     }
@@ -114,7 +115,7 @@ private:
     this->launch_kernel();
     m_cuda_timer.stop(m_launch.get_stream());
 
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
     {
       this->unblock_stream();
     }
@@ -137,7 +138,7 @@ private:
     {
       batch_size = std::max(batch_size, nvbench::int64_t{1});
 
-      if constexpr (use_blocking_kernel)
+      if (!m_disable_blocking_kernel)
       {
         // Block stream until some work is queued.
         // Limit the number of kernel executions while blocked to prevent
diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh
@@ -45,6 +45,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
 {
   using KL = typename std::remove_reference<KernelLauncher>::type;
   using namespace nvbench::exec_tag::impl;
+
   static_assert(is_exec_tag_v<ExecTags>,
                 "`ExecTags` argument must be a member (or combination of members) from "
                 "`nvbench::exec_tag`.");
@@ -55,41 +56,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
   constexpr auto modifier_tags = tags & modifier_mask;
   constexpr auto measure_tags  = tags & measure_mask;
 
-  if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
-  {
-    throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
-                             "`set_is_cpu_only(true)` is called when defining the benchmark.");
-  }
-
-  if ((modifier_tags & gpu) && this->get_is_cpu_only())
-  {
-    throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
-                             "`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
-  }
-
-  // "run once" should disable batch measurements:
-  // TODO This should just be a runtime branch in measure_cold. Currently this causes two versions
-  // of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be
-  // removed.
-  // TODO CPU measurements should support run_once as well.
-  if (!(modifier_tags & run_once) && this->get_run_once())
-  {
-    constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot);
-    this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
-    return;
-  }
-
-  // TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold
-  // and measure_hot. Currently this causes unnecesaary codegen. Note that the `sync` exec_tag
-  // implies `no_block` when refactoring.
-  if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) &&
-      this->get_disable_blocking_kernel())
-  {
-    constexpr auto no_block_tags = tags | no_block;
-    this->exec(no_block_tags, std::forward<KernelLauncher>(kernel_launcher));
-    return;
-  }
-
   // If no measurements selected, pick some defaults based on the modifiers:
   if constexpr (!measure_tags)
   {
@@ -123,6 +89,24 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
     return;
   }
 
+  if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
+  {
+    throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
+                             "`set_is_cpu_only(true)` is called when defining the benchmark.");
+  }
+
+  if ((modifier_tags & gpu) && this->get_is_cpu_only())
+  {
+    throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
+                             "`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
+  }
+
+  // Syncing will cause the blocking kernel pattern to deadlock:
+  if constexpr (modifier_tags & sync)
+  {
+    this->set_disable_blocking_kernel(true);
+  }
+
   if (this->is_skipped())
   {
     return;
@@ -157,23 +141,18 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
     {
       static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag.");
 
-      constexpr bool use_blocking_kernel = !(tags & no_block);
       if constexpr (tags & timer)
       {
-// Estimate bandwidth here
 #ifdef NVBENCH_HAS_CUPTI
-        if constexpr (!(modifier_tags & run_once))
+        if (this->is_cupti_required() && !this->get_run_once())
         {
-          if (this->is_cupti_required())
-          {
-            using measure_t = nvbench::detail::measure_cupti<KL>;
-            measure_t measure{*this, kernel_launcher};
-            measure();
-          }
+          using measure_t = nvbench::detail::measure_cupti<KL>;
+          measure_t measure{*this, kernel_launcher};
+          measure();
         }
 #endif
 
-        using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
+        using measure_t = nvbench::detail::measure_cold<KL>;
         measure_t measure{*this, kernel_launcher};
         measure();
       }
@@ -182,20 +161,16 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
         using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
         wrapper_t wrapper{kernel_launcher};
 
-// Estimate bandwidth here
 #ifdef NVBENCH_HAS_CUPTI
-        if constexpr (!(modifier_tags & run_once))
+        if (this->is_cupti_required() && !this->get_run_once())
         {
-          if (this->is_cupti_required())
-          {
-            using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
-            measure_t measure{*this, wrapper};
-            measure();
-          }
+          using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
+          measure_t measure{*this, wrapper};
+          measure();
         }
 #endif
 
-        using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
+        using measure_t = nvbench::detail::measure_cold<wrapper_t>;
         measure_t measure(*this, wrapper);
         measure();
       }
@@ -207,10 +182,13 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
       static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
       static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
       static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
-      constexpr bool use_blocking_kernel = !(tags & no_block);
-      using measure_t                    = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
-      measure_t measure{*this, kernel_launcher};
-      measure();
+
+      if (!this->get_run_once())
+      {
+        using measure_t = nvbench::detail::measure_hot<KL>;
+        measure_t measure{*this, kernel_launcher};
+        measure();
+      }
     }
   }
 }
diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh
@@ -32,12 +32,10 @@ enum class exec_flag
 
   // Modifiers:
   timer         = 0x01, // KernelLauncher uses manual timing
-  no_block      = 0x02, // Disables use of `blocking_kernel`.
-  sync          = 0x04, // KernelLauncher has indicated that it will sync
-  run_once      = 0x08, // Only run the benchmark once (for profiling).
-  gpu           = 0x10, // Don't instantiate `measure_cpu_only`.
-  no_gpu        = 0x20, // No GPU measurements should be instantiated.
-  no_batch      = 0x40, // `measure_hot` will not be used.
+  sync          = 0x02, // KernelLauncher has indicated that it will sync
+  gpu           = 0x04, // Don't instantiate `measure_cpu_only`.
+  no_gpu        = 0x08, // No GPU measurements should be instantiated.
+  no_batch      = 0x10, // `measure_hot` will not be used.
   modifier_mask = 0xFF,
 
   // Measurement types to instantiate. Derived from modifiers.
@@ -97,9 +95,7 @@ struct tag
 
 using none_t          = tag<nvbench::detail::exec_flag::none>;
 using timer_t         = tag<nvbench::detail::exec_flag::timer>;
-using no_block_t      = tag<nvbench::detail::exec_flag::no_block>;
 using sync_t          = tag<nvbench::detail::exec_flag::sync>;
-using run_once_t      = tag<nvbench::detail::exec_flag::run_once>;
 using gpu_t           = tag<nvbench::detail::exec_flag::gpu>;
 using no_gpu_t        = tag<nvbench::detail::exec_flag::no_gpu>;
 using no_batch_t      = tag<nvbench::detail::exec_flag::no_batch>;
@@ -112,9 +108,7 @@ using measure_mask_t  = tag<nvbench::detail::exec_flag::measure_mask>;
 
 constexpr inline none_t none;
 constexpr inline timer_t timer;
-constexpr inline no_block_t no_block;
 constexpr inline sync_t sync;
-constexpr inline run_once_t run_once;
 constexpr inline gpu_t gpu;
 constexpr inline no_gpu_t no_gpu;
 constexpr inline no_batch_t no_batch;
@@ -136,8 +130,7 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer | //
 
 /// Modifier used to indicate that the KernelGenerator will perform CUDA
 /// synchronizations. Without this flag such benchmarks will deadlock.
-constexpr inline auto sync = nvbench::exec_tag::impl::no_block | //
-                             nvbench::exec_tag::impl::sync |     //
+constexpr inline auto sync = nvbench::exec_tag::impl::sync | //
                              nvbench::exec_tag::impl::no_batch;
 
 /// Modifier used to indicate that batched measurements should be disabled

Original file line number	Diff line number	Diff line change
`@@ -54,8 +54,8 @@ struct measure_cold_base`
`54`	`54`	`measure_cold_base &operator=(measure_cold_base &&) = delete;`
`55`	`55`
`56`	`56`	`protected:`
`57`		`- template <bool use_blocking_kernel>`
`58`	`57`	`struct kernel_launch_timer;`
	`58`	`+ friend struct kernel_launch_timer;`
`59`	`59`
`60`	`60`	`void check();`
`61`	`61`	`void initialize();`
`@@ -89,8 +89,8 @@ protected:`
`89`	`89`	`nvbench::criterion_params m_criterion_params;`
`90`	`90`	`nvbench::stopping_criterion_base& m_stopping_criterion;`
`91`	`91`
	`92`	`+ bool m_disable_blocking_kernel{false};`
`92`	`93`	`bool m_run_once{false};`
`93`		`- bool m_no_block{false};`
`94`	`94`
`95`	`95`	`nvbench::int64_t m_min_samples{};`
`96`	`96`
`@@ -108,23 +108,23 @@ protected:`
`108`	`108`	`bool m_max_time_exceeded{};`
`109`	`109`	`};`
`110`	`110`
`111`		`-template <bool use_blocking_kernel>`
`112`	`111`	`struct measure_cold_base::kernel_launch_timer`
`113`	`112`	`{`
`114`	`113`	`kernel_launch_timer(measure_cold_base &measure)`
`115`	`114`	`: m_measure{measure}`
	`115`	`+ , m_disable_blocking_kernel{measure.m_disable_blocking_kernel}`
`116`	`116`	`{}`
`117`	`117`
`118`	`118`	`__forceinline__ void start()`
`119`	`119`	`{`
`120`	`120`	`m_measure.flush_device_l2();`
`121`	`121`	`m_measure.sync_stream();`
`122`		`- if constexpr (use_blocking_kernel)`
	`122`	`+ if (!m_disable_blocking_kernel)`
`123`	`123`	`{`
`124`	`124`	`m_measure.block_stream();`
`125`	`125`	`}`
`126`	`126`	`m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());`
`127`		`- if constexpr (!use_blocking_kernel)`
	`127`	`+ if (m_disable_blocking_kernel)`
`128`	`128`	`{`
`129`	`129`	`m_measure.m_cpu_timer.start();`
`130`	`130`	`}`
`@@ -133,7 +133,7 @@ struct measure_cold_base::kernel_launch_timer`
`133`	`133`	`__forceinline__ void stop()`
`134`	`134`	`{`
`135`	`135`	`m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());`
`136`		`- if constexpr (use_blocking_kernel)`
	`136`	`+ if (!m_disable_blocking_kernel)`
`137`	`137`	`{`
`138`	`138`	`m_measure.m_cpu_timer.start();`
`139`	`139`	`m_measure.unblock_stream();`
`@@ -144,9 +144,10 @@ struct measure_cold_base::kernel_launch_timer`
`144`	`144`
`145`	`145`	`private:`
`146`	`146`	`measure_cold_base &m_measure;`
	`147`	`+ bool m_disable_blocking_kernel;`
`147`	`148`	`};`
`148`	`149`
`149`		`-template <typename KernelLauncher, bool use_blocking_kernel>`
	`150`	`+template <typename KernelLauncher>`
`150`	`151`	`struct measure_cold : public measure_cold_base`
`151`	`152`	`{`
`152`	`153`	`measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)`
`@@ -177,15 +178,15 @@ private:`
`177`	`178`	`return;`
`178`	`179`	`}`
`179`	`180`
`180`		`- kernel_launch_timer<use_blocking_kernel> timer(*this);`
	`181`	`+ kernel_launch_timer timer(*this);`
`181`	`182`
`182`	`183`	`this->launch_kernel(timer);`
`183`	`184`	`this->check_skip_time(m_cuda_timer.get_duration());`
`184`	`185`	`}`
`185`	`186`
`186`	`187`	`void run_trials()`
`187`	`188`	`{`
`188`		`- kernel_launch_timer<use_blocking_kernel> timer(*this);`
	`189`	`+ kernel_launch_timer timer(*this);`
`189`	`190`	`do`
`190`	`191`	`{`
`191`	`192`	`this->launch_kernel(timer);`
Original file line number	Diff line number	Diff line change
`@@ -80,10 +80,11 @@ protected:`
`80`	`80`	`nvbench::int64_t m_total_samples{};`
`81`	`81`	`nvbench::float64_t m_total_cuda_time{};`
`82`	`82`
	`83`	`+ bool m_disable_blocking_kernel{false};`
`83`	`84`	`bool m_max_time_exceeded{false};`
`84`	`85`	`};`
`85`	`86`
`86`		`-template <typename KernelLauncher, bool use_blocking_kernel>`
	`87`	`+template <typename KernelLauncher>`
`87`	`88`	`struct measure_hot : public measure_hot_base`
`88`	`89`	`{`
`89`	`90`	`measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)`
`@@ -105,7 +106,7 @@ private:`
`105`	`106`	`// measurement.`
`106`	`107`	`void run_warmup()`
`107`	`108`	`{`
`108`		`- if constexpr (use_blocking_kernel)`
	`109`	`+ if (!m_disable_blocking_kernel)`
`109`	`110`	`{`
`110`	`111`	`this->block_stream();`
`111`	`112`	`}`
`@@ -114,7 +115,7 @@ private:`
`114`	`115`	`this->launch_kernel();`
`115`	`116`	`m_cuda_timer.stop(m_launch.get_stream());`
`116`	`117`
`117`		`- if constexpr (use_blocking_kernel)`
	`118`	`+ if (!m_disable_blocking_kernel)`
`118`	`119`	`{`
`119`	`120`	`this->unblock_stream();`
`120`	`121`	`}`
`@@ -137,7 +138,7 @@ private:`
`137`	`138`	`{`
`138`	`139`	`batch_size = std::max(batch_size, nvbench::int64_t{1});`
`139`	`140`
`140`		`- if constexpr (use_blocking_kernel)`
	`141`	`+ if (!m_disable_blocking_kernel)`
`141`	`142`	`{`
`142`	`143`	`// Block stream until some work is queued.`
`143`	`144`	`// Limit the number of kernel executions while blocked to prevent`