1212
1313use std:: collections:: HashMap ;
1414use std:: sync:: atomic:: { AtomicBool , AtomicU64 , Ordering } ;
15- use std:: sync:: { Arc , Condvar , Mutex } ;
15+ use std:: sync:: { Condvar , Mutex } ;
1616use std:: time:: { Duration , Instant } ;
1717
1818/// ═══════════════════════════════════════════════════════════════════════
19- /// GROUP COMMIT ENGINE
19+ /// GROUP COMMIT ENGINE — v2 (No-Sleep Design)
2020/// ═══════════════════════════════════════════════════════════════════════
2121///
22- /// Instead of calling fsync() for every single commit, the group commit
23- /// engine collects pending writes and issues a single fsync for the entire
24- /// batch. This is how PostgreSQL, MySQL InnoDB, and RocksDB achieve high
25- /// write throughput.
22+ /// Coalesces concurrent fsync calls into a single fsync per batch.
2623///
27- /// ## How it works :
24+ /// ## Design (no sleep, no timed wait) :
2825///
29- /// 1. Writer arrives and joins the current write group.
30- /// 2. First writer in the group becomes the "leader".
31- /// 3. Leader waits briefly (configurable, default 200µs) for more writers.
32- /// 4. Leader issues one fsync for all writers in the group.
33- /// 5. All writers in the group are notified of completion.
26+ /// 1. Each writer appends data to heap + WAL (no fsync yet).
27+ /// 2. Writer enters `join_group()`.
28+ /// 3. If no sync is in progress → become leader, sync immediately.
29+ /// 4. If a sync IS in progress → wait as follower.
30+ /// 5. When the leader's sync completes, ALL followers are released.
31+ /// 6. Natural batching: while leader fsyncs (~2ms), new writers queue up.
32+ /// Next leader syncs for everyone who arrived during those 2ms.
3433///
35- /// Under 1000 concurrent writers, this reduces fsyncs from 1000 to ~5-10.
34+ /// This achieves the same throughput as a timed-wait design without the
35+ /// latency overhead of sleeping on every single-threaded write.
3636
3737pub struct GroupCommitEngine {
38- /// Maximum time to wait for group to fill (microseconds).
39- max_wait_us : u64 ,
4038 /// State of the current write group.
4139 state : Mutex < GroupState > ,
42- /// Condition variable for waiting writers .
40+ /// Condition variable for waiting followers .
4341 cond : Condvar ,
44- /// Monotonic epoch counter — increments on each group commit .
42+ /// Monotonic epoch counter — increments on each completed sync .
4543 epoch : AtomicU64 ,
46- /// Whether the engine is active.
47- active : AtomicBool ,
4844}
4945
5046struct GroupState {
51- /// Number of pending writers in the current group.
47+ /// Number of writers waiting in the current group (including leader) .
5248 pending_count : usize ,
5349 /// The epoch that was last committed.
5450 committed_epoch : u64 ,
@@ -58,48 +54,46 @@ struct GroupState {
5854
5955impl GroupCommitEngine {
6056 /// Creates a new GroupCommitEngine.
61- ///
62- /// `max_wait_us` — maximum microseconds to wait for group to fill.
63- /// Typical values: 100-500µs for SSDs, 1000-5000µs for HDDs.
64- pub fn new ( max_wait_us : u64 ) -> Self {
57+ pub fn new ( _max_wait_us : u64 ) -> Self {
6558 Self {
66- max_wait_us,
6759 state : Mutex :: new ( GroupState {
6860 pending_count : 0 ,
6961 committed_epoch : 0 ,
7062 sync_in_progress : false ,
7163 } ) ,
7264 cond : Condvar :: new ( ) ,
7365 epoch : AtomicU64 :: new ( 1 ) ,
74- active : AtomicBool :: new ( true ) ,
7566 }
7667 }
7768
78- /// Called by each writer to join a write group and wait for fsync.
69+ /// Join the current write group.
70+ ///
71+ /// Returns a guard indicating whether this writer is the leader.
72+ /// - Leader: must perform fsync, then call `guard.mark_synced()`.
73+ /// - Follower: blocks until the leader's sync completes, then returns.
7974 ///
80- /// Returns `true` if this writer should perform the fsync (it's the leader),
81- /// or `false` if the fsync was already done by the leader.
82- pub fn join_group ( & self ) -> GroupCommitGuard {
75+ /// No sleep, no timed wait. The leader syncs immediately.
76+ /// Natural batching occurs because followers accumulate during the
77+ /// ~2ms fsync window.
78+ pub fn join_group ( & self ) -> GroupCommitGuard < ' _ > {
8379 let my_epoch = self . epoch . load ( Ordering :: SeqCst ) ;
8480
8581 let mut state = self . state . lock ( ) . expect ( "group state" ) ;
8682 state. pending_count += 1 ;
87- let is_leader = state. pending_count == 1 && !state. sync_in_progress ;
8883
89- if is_leader {
84+ if !state. sync_in_progress {
85+ // No sync running → I'm the leader. Start syncing immediately.
9086 state. sync_in_progress = true ;
9187 drop ( state) ;
9288
93- // Leader waits briefly for more writers to join
94- std:: thread:: sleep ( Duration :: from_micros ( self . max_wait_us ) ) ;
95-
89+ // No sleep! Leader proceeds directly to fsync.
9690 GroupCommitGuard {
9791 engine : self ,
98- epoch : my_epoch,
9992 is_leader : true ,
10093 }
10194 } else {
102- // Follower: wait for the leader to complete the sync
95+ // A sync is already in progress → wait as follower.
96+ // The leader will wake us when done.
10397 while state. committed_epoch < my_epoch {
10498 state = self . cond . wait ( state) . expect ( "condvar wait" ) ;
10599 }
@@ -108,45 +102,43 @@ impl GroupCommitEngine {
108102
109103 GroupCommitGuard {
110104 engine : self ,
111- epoch : my_epoch,
112105 is_leader : false ,
113106 }
114107 }
115108 }
116109
117110 /// Called by the leader after performing the actual fsync.
118- pub fn complete_sync ( & self ) {
111+ fn complete_sync ( & self ) {
119112 let new_epoch = self . epoch . fetch_add ( 1 , Ordering :: SeqCst ) ;
120113
121114 let mut state = self . state . lock ( ) . expect ( "group state" ) ;
122115 state. committed_epoch = new_epoch;
123116 state. sync_in_progress = false ;
124- // Leader counts itself
125117 state. pending_count -= 1 ;
126118 drop ( state) ;
127119
128120 // Wake all waiting followers
129121 self . cond . notify_all ( ) ;
130122 }
131123
132- /// Returns the current group commit statistics .
124+ /// Returns (committed_epoch, pending_count) .
133125 pub fn stats ( & self ) -> ( u64 , usize ) {
134126 let state = self . state . lock ( ) . expect ( "group state" ) ;
135127 ( state. committed_epoch , state. pending_count )
136128 }
137129}
138130
139- /// Guard returned by `join_group()`. Check `is_leader` to determine
140- /// whether this writer should perform the fsync.
131+ /// Guard returned by `join_group()`.
132+ /// If `is_leader` is true, perform fsync then call `mark_synced()`.
133+ /// If `is_leader` is false, the sync is already done — just proceed.
141134pub struct GroupCommitGuard < ' a > {
142135 engine : & ' a GroupCommitEngine ,
143- pub epoch : u64 ,
144- /// If true, this writer is the leader and should call fsync.
136+ /// If true, this writer must perform the fsync.
145137 pub is_leader : bool ,
146138}
147139
148- impl < ' a > GroupCommitGuard < ' a > {
149- /// Call this after performing fsync (leader only).
140+ impl GroupCommitGuard < ' _ > {
141+ /// Call after performing fsync (leader only). Wakes all followers .
150142 pub fn mark_synced ( self ) {
151143 if self . is_leader {
152144 self . engine . complete_sync ( ) ;
@@ -211,7 +203,6 @@ impl RateLimiter {
211203
212204 // Evict oldest bucket if at capacity
213205 if buckets. len ( ) >= self . max_users && !buckets. contains_key ( user_id) {
214- // Simple eviction: remove the user with the oldest last_refill
215206 let oldest = buckets
216207 . iter ( )
217208 . min_by_key ( |( _, b) | b. last_refill )
@@ -235,7 +226,6 @@ impl RateLimiter {
235226 bucket. tokens -= 1.0 ;
236227 Ok ( bucket. tokens as u32 )
237228 } else {
238- // Calculate retry-after time
239229 let deficit = 1.0 - bucket. tokens ;
240230 let retry_ms = ( deficit / rate * 1000.0 ) as u64 ;
241231 Err ( retry_ms. max ( 1 ) )
0 commit comments