[FGParallelRouter] Updated Barrier to C++20 Std Barrier

AlexandreSinger · AlexandreSinger · commit 571d3f4781c1 · 2025-05-17T13:56:20.000-04:00
The fine-grained parallel router was originally built before VTR
upgraded to C++20, so we had to roll our own barrier. We originally had
two barriers: a spin barrier (where threads spin on a lock while waiting)
and a "mutex" barrier (where threads wait on a condition variable and
potentially go to sleep).

Through experimentation, we found that the choice of barrier
implementation did not matter; however, the standard barrier provides
slight performance improvements for very long routes and has a much
cleaner interface.

Moved the FG parallel router to the standard barrier. The old
implementations are left in as classes in case C++20 is not available
or not preferred for some users.

Also added a QoR script to make parsing FG parallel router runs easier.
diff --git a/vpr/src/route/parallel_connection_router.h b/vpr/src/route/parallel_connection_router.h
@@ -7,6 +7,7 @@
 #include "multi_queue_d_ary_heap.h"
 
 #include <atomic>
+#include <barrier>
 #include <thread>
 #include <mutex>
 #include <condition_variable>
@@ -48,7 +49,6 @@ class spin_lock_t {
  * condition variable to coordinate thread synchronization.
  */
 class barrier_mutex_t {
-    // FIXME: Try std::barrier (since C++20) to replace this mutex barrier
     std::mutex mutex_;
     std::condition_variable cv_;
     size_t count_;
@@ -61,17 +61,22 @@ class barrier_mutex_t {
      * @param num_threads Number of threads that must call wait() before
      * any thread is allowed to proceed
      */
-    explicit barrier_mutex_t(size_t num_threads)
+    explicit inline barrier_mutex_t(size_t num_threads)
         : count_(num_threads)
         , max_count_(num_threads) {}
 
+    /**
+     * @brief No-op; this barrier implementation needs no initialization step.
+     */
+    inline void init() {}
+
     /**
      * @brief Blocks the calling thread until all threads have called wait()
      *
      * When the specified number of threads have called this method, all
      * threads are unblocked and the barrier is reset for the next use.
      */
-    void wait() {
+    inline void wait() {
         std::unique_lock<std::mutex> lock{mutex_};
         size_t gen = generation_;
         if (--count_ == 0) {
@@ -111,13 +116,13 @@ class barrier_spin_t {
      * @param num_threads Number of threads that must call wait() before
      * any thread is allowed to proceed
      */
-    explicit barrier_spin_t(size_t num_threads) { num_threads_ = num_threads; }
+    explicit inline barrier_spin_t(size_t num_threads) { num_threads_ = num_threads; }
 
     /**
      * @brief Initializes the thread-local sense flag
      * @note Should be called by each thread before first using the barrier.
      */
-    void init() {
+    inline void init() {
         local_sense_ = false;
     }
 
@@ -128,7 +133,7 @@ class barrier_spin_t {
      * to arrive unblocks all waiting threads. This method avoids using locks or
      * condition variables, making it potentially more efficient for short waits.
      */
-    void wait() {
+    inline void wait() {
         bool s = !local_sense_;
         local_sense_ = s;
         size_t num_arrivals = count_.fetch_add(1) + 1;
@@ -142,7 +147,41 @@ class barrier_spin_t {
     }
 };
 
-using barrier_t = barrier_spin_t; // Using the spin-based thread barrier
+/**
+ * @brief Thread barrier implemented as a thin wrapper around std::barrier
+ *
+ * All participating threads block in wait() until every one has arrived. Offers
+ * the same init()/wait() interface as barrier_mutex_t and barrier_spin_t.
+ */
+class standard_barrier_t {
+    /// @brief Underlying std::barrier that performs the actual synchronization.
+    std::barrier<> barrier_;
+
+  public:
+    /**
+     * @brief Constructs a barrier for a specific number of threads
+     *
+     * @param num_threads
+     *      Number of threads that must call wait() before any thread is allowed
+     *      to proceed; forwarded unchanged to the std::barrier constructor.
+     */
+    explicit inline standard_barrier_t(size_t num_threads)
+        : barrier_(num_threads) {}
+
+    /**
+     * @brief No-op; this barrier implementation needs no initialization step.
+     */
+    inline void init() {}
+
+    /**
+     * @brief Arrives at the barrier and blocks until all threads have called wait()
+     */
+    inline void wait() {
+        barrier_.arrive_and_wait();
+    }
+};
+
+using barrier_t = standard_barrier_t; // Using the standard thread barrier
 
 /**
  * @class ParallelConnectionRouter implements the MultiQueue-based parallel connection
diff --git a/vtr_flow/parse/qor_config/qor_fg_parallel_router_fixed_chan_width.txt b/vtr_flow/parse/qor_config/qor_fg_parallel_router_fixed_chan_width.txt
@@ -0,0 +1,10 @@
+# This collects QoR data that is interesting for the Fine-Grained Parallel
+# Router running on a fixed channel width.
+
+vpr_status;output.txt;vpr_status=(.*)
+crit_path_delay;vpr.out;Critical path: (.*) ns
+post_route_wirelength;vpr.out;\s*Total wirelength: (\d+)
+total_connection_pathsearch_time;vpr.out;.*Time spent on path search: (.*) seconds.
+route_runtime;vpr.out;Routing took (.*) seconds
+total_runtime;vpr.out;The entire flow of VPR took (.*) seconds
+magic_cookie;vpr.out;Serial number \(magic cookie\) for the routing is: (.*)