
try optimize queue spinlock #60


Merged · 6 commits · Jul 30, 2024

38 changes: 26 additions & 12 deletions bench/readme.md
@@ -14,6 +14,20 @@
| IntrusiveThreadPool_task_100000/iterations:10/repeats:5_stddev | 20.8 ms | 5.86 ms| 5 |
| IntrusiveThreadPool_task_100000/iterations:10/repeats:5_cv | 6.50 % | 6.31 % | 5 |

+### DistributedPool Task 100,000 (std::mutex)
+
+| Benchmark | Time | CPU | Iterations |
+|-----------------------------------------------------------------|---------|--------|------------|
+| DistributedPool_task_100000/iterations:10/repeats:5 | 254 ms | 96.8 ms| 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 492 ms | 196 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 563 ms | 224 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 254 ms | 118 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 175 ms | 83.4 ms| 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5_mean | 348 ms | 144 ms | 5 |
+| DistributedPool_task_100000/iterations:10/repeats:5_median | 254 ms | 118 ms | 5 |
+| DistributedPool_task_100000/iterations:10/repeats:5_stddev | 169 ms | 62.7 ms| 5 |
+| DistributedPool_task_100000/iterations:10/repeats:5_cv | 48.67 % | 43.65 %| 5 |

### DistributedPool Task 100,000 (spinlock)

| Benchmark | Time | CPU | Iterations |
@@ -42,19 +56,19 @@
| DistributedPool_task_100000/iterations:10/repeats:5_stddev | 108 ms | 45.5 ms| 5 |
| DistributedPool_task_100000/iterations:10/repeats:5_cv | 37.98 %| 39.18 %| 5 |

-### DistributedPool Task 100,000 (std::mutex)
+### DistributedPool Task 100,000 (spinlock with weak mm)

-| Benchmark | Time | CPU | Iterations |
-|-----------------------------------------------------------------|---------|--------|------------|
-| DistributedPool_task_100000/iterations:10/repeats:5 | 254 ms | 96.8 ms| 10 |
-| DistributedPool_task_100000/iterations:10/repeats:5 | 492 ms | 196 ms | 10 |
-| DistributedPool_task_100000/iterations:10/repeats:5 | 563 ms | 224 ms | 10 |
-| DistributedPool_task_100000/iterations:10/repeats:5 | 254 ms | 118 ms | 10 |
-| DistributedPool_task_100000/iterations:10/repeats:5 | 175 ms | 83.4 ms| 10 |
-| DistributedPool_task_100000/iterations:10/repeats:5_mean | 348 ms | 144 ms | 5 |
-| DistributedPool_task_100000/iterations:10/repeats:5_median | 254 ms | 118 ms | 5 |
-| DistributedPool_task_100000/iterations:10/repeats:5_stddev | 169 ms | 62.7 ms| 5 |
-| DistributedPool_task_100000/iterations:10/repeats:5_cv | 48.67 % | 43.65 %| 5 |
+| Benchmark | Time | CPU | Iterations |
+| --- | --- | --- | --- |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 186 ms | 73.0 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 132 ms | 62.2 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 145 ms | 59.1 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 126 ms | 58.9 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5 | 149 ms | 62.4 ms | 10 |
+| DistributedPool_task_100000/iterations:10/repeats:5_mean | 148 ms | 63.1 ms | 5 |
+| DistributedPool_task_100000/iterations:10/repeats:5_median | 145 ms | 62.2 ms | 5 |
+| DistributedPool_task_100000/iterations:10/repeats:5_stddev | 23.5 ms | 5.76 ms | 5 |
+| DistributedPool_task_100000/iterations:10/repeats:5_cv | 15.90 % | 9.13 % | 5 |

### Compare std::mutex and async_mutex with coro

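(The `iterations:10/repeats:5` names and the `_mean`/`_median`/`_stddev`/`_cv` aggregate rows are Google Benchmark's repetition output. A minimal sketch of how such a case could be registered is shown below; the pool setup and task body are placeholders, not the repository's actual benchmark code.)

```cpp
#include <benchmark/benchmark.h>

// Hypothetical registration sketch; the real benchmark lives in bench/.
static void DistributedPool_task_100000(benchmark::State& state) {
  for (auto _ : state) {
    // Submit 100,000 small tasks to the distributed pool and wait for
    // all of them to finish (e.g. with a WaitGroup) before the
    // iteration ends.
  }
}
// Produces the iterations:10/repeats:5 rows plus the mean/median/stddev/cv
// aggregates seen in the tables above.
BENCHMARK(DistributedPool_task_100000)->Iterations(10)->Repetitions(5);
```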
19 changes: 11 additions & 8 deletions src/components/sync/queue_spinlock.h
@@ -14,21 +14,23 @@ class QueueSpinLock final {
  public:
   explicit Guard(QueueSpinLock& host) : host(host) { host.Acquire(this); }
   ~Guard() {
-    if (is_owner) Release();
+    if (is_owner.load(std::memory_order_acquire)) Release();
   }

   void Release() {
     host.Release(this);
-    is_owner.store(false);
+    is_owner.store(false, std::memory_order_release);
   }

-  void SetOwner() { is_owner.store(true); }
+  void SetOwner() { is_owner.store(true, std::memory_order_release); }

-  void SetNext(Guard* guard) { next.store(guard); }
+  void SetNext(Guard* guard) { next.store(guard, std::memory_order_release); }

-  bool IsOwner() const { return is_owner.load(); }
+  bool IsOwner() const {
+    return is_owner.load(std::memory_order_acquire);
+  }

-  bool HasNext() const { return next.load() != nullptr; }
+  bool HasNext() const { return next.load(std::memory_order_acquire) != nullptr; }

   void SetNextOwner() { next.load()->SetOwner(); }

@@ -40,7 +42,7 @@

  private:
   void Acquire(Guard* guard) {
-    auto ancestor = tail_.exchange(guard);
+    auto ancestor = tail_.exchange(guard/*, std::memory_order_acquire*/);
     if (ancestor == nullptr) {
       guard->SetOwner();
       return;
@@ -58,7 +60,8 @@
     }

     Guard* old_guard = guard;
-    while (!tail_.compare_exchange_weak(old_guard, nullptr)) {
+    while (!tail_.compare_exchange_weak(old_guard, nullptr/*,
+                                        std::memory_order_release*/)) {
       if (guard->HasNext()) {
         guard->SetNextOwner();
         return;
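(The hunks above switch `Guard`'s atomic flags from defaulted sequentially consistent accesses to explicit acquire/release orderings, while the orderings for `tail_.exchange` and `compare_exchange_weak` are only added as comments, i.e. the tail operations stay seq_cst for now. A minimal usage sketch, assuming the header above; the function and variable names here are illustrative only.)

```cpp
#include <components/sync/queue_spinlock.h>

NSync::QueueSpinLock spinlock;
int shared_counter = 0;  // protected by spinlock

void IncrementUnderLock() {
  // The Guard acts as a node in an MCS-style queue: Acquire() swaps it into
  // tail_ and, if there is a predecessor, links itself via SetNext() and
  // waits until the predecessor hands ownership over with SetNextOwner().
  NSync::QueueSpinLock::Guard guard(spinlock);
  ++shared_counter;
}  // ~Guard() releases the lock if this node still owns it
```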
9 changes: 5 additions & 4 deletions src/fiber/awaiter/wait_group_awaiter.h
@@ -7,6 +7,7 @@
 #include <mutex>

 #include <components/intrusive/list.h>
+#include <components/sync/queue_spinlock.h>
 #include <fiber/awaiter/awaiter.h>

 namespace NFibers {
@@ -15,23 +16,23 @@ template <class W>
 class WaitGroupWaiter : public IAwaiter,
                         public NComponents::Node<WaitGroupWaiter<W>> {
  public:
-  using Guard = std::unique_lock<typename W::Spinlock>;
+  using Guard = NSync::QueueSpinLock::Guard;

-  WaitGroupWaiter(W* wg, Guard guard) : wg(wg), guard(std::move(guard)){};
+  WaitGroupWaiter(W* wg, Guard& guard) : wg(wg), guard(guard){};

   void AwaitSuspend(StoppedFiber fiber) override {
     assert(fiber.IsValid());

     stopped_fiber = fiber;
     wg->Park(this);
-    guard.release()->unlock();
+    guard.Release();
   }

   void Schedule() { stopped_fiber.Schedule(); }

  private:
   W* wg;
-  Guard guard;
+  Guard& guard;
   StoppedFiber stopped_fiber;
 };

2 changes: 1 addition & 1 deletion src/fiber/sync/wait_group.cpp
@@ -38,7 +38,7 @@ void WaitGroup::Done() {
 void WaitGroup::Wait() {
   Waiter::Guard guard(spinlock_);
   if (counter_ > 0) {
-    Waiter wg_waiter(this, std::move(guard));
+    Waiter wg_waiter(this, guard);
     Suspend(&wg_waiter);
   }
 }
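(Taken together, `Wait()` now keeps the queue-lock `Guard` on its own frame, hands the waiter a reference to it, and the awaiter releases it only after the fiber has been parked, so a concurrent `Done()` cannot miss the waiter. Below is a self-contained, thread-based analogue of that ordering using `std::mutex`/`std::condition_variable` instead of the fiber runtime; `MiniWaitGroup` is purely illustrative, not part of the repository.)

```cpp
#include <condition_variable>
#include <mutex>
#include <thread>

// Thread-based analogue: the waiter is registered (via the predicate) while
// the lock is held, and the lock is dropped only while actually parked.
class MiniWaitGroup {
 public:
  void Add(int n) {
    std::lock_guard<std::mutex> g(m_);
    counter_ += n;
  }

  void Done() {
    std::lock_guard<std::mutex> g(m_);
    if (--counter_ == 0) cv_.notify_all();
  }

  void Wait() {
    std::unique_lock<std::mutex> guard(m_);  // lock lives on this frame
    cv_.wait(guard, [this] { return counter_ == 0; });  // released while parked
  }

 private:
  std::mutex m_;
  std::condition_variable cv_;
  int counter_ = 0;
};

int main() {
  MiniWaitGroup wg;
  wg.Add(2);
  std::thread t1([&] { wg.Done(); });
  std::thread t2([&] { wg.Done(); });
  wg.Wait();  // returns once both Done() calls have run
  t1.join();
  t2.join();
}
```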
3 changes: 2 additions & 1 deletion src/fiber/sync/wait_group.h
@@ -8,12 +8,13 @@

 #include <components/intrusive/list.h>
 #include <components/sync/spinLock.h>
+#include <components/sync/queue_spinlock.h>
 #include <fiber/awaiter/wait_group_awaiter.h>

 namespace NFibers {

 class WaitGroup {
-  using Spinlock = NSync::SpinLock;
+  using Spinlock = NSync::QueueSpinLock;
   using Waiter = WaitGroupWaiter<WaitGroup>;

   friend Waiter;