@@ -110,32 +110,40 @@ fn quarantine_entry(quarantine_dir: &std::path::Path, entry: &WalEntry, kind: &s
110110/// `snapshot_stats()` and rendered as rows by `timefusion.stats()`.
111111#[ derive( Debug , Clone ) ]
112112pub struct StatsSnapshot {
113- pub mem_project_count : usize ,
114- pub mem_total_buckets : usize ,
115- pub mem_total_rows : usize ,
116- pub mem_total_batches : usize ,
117- pub mem_estimated_bytes : usize ,
118- pub reserved_bytes : usize ,
119- pub max_memory_bytes : usize ,
120- pub pressure_pct : u32 ,
121- pub wal_files : usize ,
122- pub wal_disk_bytes : u64 ,
123- pub wal_shards_per_topic : usize ,
124- pub wal_known_topics : usize ,
125- pub bucket_duration_micros : i64 ,
113+ pub mem_project_count : usize ,
114+ pub mem_total_buckets : usize ,
115+ pub mem_total_rows : usize ,
116+ pub mem_total_batches : usize ,
117+ pub mem_estimated_bytes : usize ,
118+ pub reserved_bytes : usize , // NOTE(review): presumably a copy of BufferedWriteLayer's `reserved_bytes` (memory reserved for in-flight writes) — confirm where the snapshot is built
119+ pub max_memory_bytes : usize ,
120+ pub pressure_pct : u32 , // NOTE(review): presumably memory use as a percentage of `max_memory_bytes` — the computation is not visible here, confirm
121+ pub wal_files : usize ,
122+ pub wal_disk_bytes : u64 ,
123+ pub wal_shards_per_topic : usize ,
124+ pub wal_known_topics : usize ,
125+ pub bucket_duration_micros : i64 ,
126126 /// Age of the oldest bucket in MemBuffer (seconds, computed from
127127 /// `now - min(bucket.min_timestamp)`). None when MemBuffer is empty.
128128 /// Alerting target: alert at > 2× `flush_interval_secs`.
129- pub oldest_bucket_age_secs : Option < u64 > ,
129+ pub oldest_bucket_age_secs : Option < u64 > ,
130130 /// Cumulative flush successes/failures since process start. Mirrors the
131131 /// OTel `timefusion.flush.completed`/`failed` counters so tests can
132132 /// assert without configuring OTel.
133- pub flush_completed_total : u64 ,
134- pub flush_failed_total : u64 ,
133+ pub flush_completed_total : u64 , // Relaxed load of the layer's `flush_completed_total` atomic at snapshot time
134+ pub flush_failed_total : u64 , // Relaxed load of the layer's `flush_failed_total` atomic at snapshot time
135135 /// Times an insert hit the memory hard limit and applied backpressure
136136 /// (synchronous flush-to-Delta) instead of rejecting. Sustained growth =
137137 /// ingest outpacing flush; the matching OTel counter is the alert target.
138- pub backpressure_engaged_total : u64 ,
138+ pub backpressure_engaged_total : u64 , // Relaxed load of the layer's `backpressure_engaged_total` atomic at snapshot time
139+ /// Inserts rejected after the backpressure window expired without freeing
140+ /// memory — Delta flush isn't keeping up. PAGE on any growth (data is still
141+ /// in the WAL but ingest is now dropping). Mirrored from OTel so operators
142+ /// can watch it via the stats table when telemetry isn't wired.
143+ pub backpressure_rejected_total : u64 , // Relaxed load of the layer's `backpressure_rejected_total` atomic at snapshot time
144+ /// Open-bucket force-flush escalations (a single busy window was itself the
145+ /// pressure). Sustained growth = windows too large for the budget.
146+ pub backpressure_force_flush_total : u64 , // Relaxed load of the layer's `backpressure_force_flush_total` atomic at snapshot time
139147}
140148
141149#[ derive( Debug , Default ) ]
@@ -193,6 +201,10 @@ struct CoalescedGroup {
193201 wal_positions : Vec < Option < walrus_rust:: WalPosition > > ,
194202 /// Source bucket_ids; drained from MemBuffer after the combined commit succeeds.
195203 source_bucket_ids : Vec < i64 > ,
204+ /// Min/max timestamp across absorbed buckets (Option so the derived Default's
205+ /// 0 can't corrupt the min). Carried onto the combined FlushableBucket.
206+ min_timestamp : Option < i64 > ,
207+ max_timestamp : Option < i64 > ,
196208}
197209
198210struct CombinedBucket {
@@ -225,6 +237,8 @@ impl CoalescedGroup {
225237 }
226238 // Merge per-shard positions (max).
227239 self . wal_positions = merge_wal_positions ( std:: mem:: take ( & mut self . wal_positions ) , b. wal_positions ) ;
240+ self . min_timestamp = Some ( self . min_timestamp . map_or ( b. min_timestamp , |m| m. min ( b. min_timestamp ) ) ) ;
241+ self . max_timestamp = Some ( self . max_timestamp . map_or ( b. max_timestamp , |m| m. max ( b. max_timestamp ) ) ) ;
228242 self . source_bucket_ids . push ( b. bucket_id ) ;
229243 }
230244
@@ -236,6 +250,8 @@ impl CoalescedGroup {
236250 wal_shard_counts,
237251 wal_positions,
238252 source_bucket_ids,
253+ min_timestamp,
254+ max_timestamp,
239255 } = self ;
240256 // `absorb` is only called via `groups.entry(..).or_default().absorb(b)`
241257 // so `key` is always set by the time we collapse the group.
@@ -251,6 +267,8 @@ impl CoalescedGroup {
251267 row_count,
252268 wal_shard_counts,
253269 wal_positions,
270+ min_timestamp : min_timestamp. unwrap_or ( i64:: MAX ) ,
271+ max_timestamp : max_timestamp. unwrap_or ( i64:: MIN ) ,
254272 } ;
255273 CombinedBucket { combined, source_bucket_ids }
256274 }
@@ -267,38 +285,40 @@ pub type TantivyIndexCallback =
267285 Arc < dyn Fn ( String , String , Vec < RecordBatch > , Vec < String > ) -> futures:: future:: BoxFuture < ' static , anyhow:: Result < ( ) > > + Send + Sync > ;
268286
269287pub struct BufferedWriteLayer {
270- config : Arc < AppConfig > ,
271- wal : Arc < WalManager > ,
272- mem_buffer : Arc < MemBuffer > ,
273- shutdown : CancellationToken ,
274- delta_write_callback : Option < DeltaWriteCallback > ,
275- tantivy_index_callback : Option < TantivyIndexCallback > ,
276- background_tasks : Mutex < Vec < JoinHandle < ( ) > > > ,
277- flush_lock : Mutex < ( ) > ,
278- reserved_bytes : AtomicUsize , // Memory reserved for in-flight writes
279- pressure_notify : Arc < Notify > , // Wakes flush task when pressure threshold crossed
288+ config : Arc < AppConfig > ,
289+ wal : Arc < WalManager > ,
290+ mem_buffer : Arc < MemBuffer > ,
291+ shutdown : CancellationToken ,
292+ delta_write_callback : Option < DeltaWriteCallback > ,
293+ tantivy_index_callback : Option < TantivyIndexCallback > ,
294+ background_tasks : Mutex < Vec < JoinHandle < ( ) > > > ,
295+ flush_lock : Mutex < ( ) > ,
296+ reserved_bytes : AtomicUsize , // Memory reserved for in-flight writes
297+ pressure_notify : Arc < Notify > , // Wakes flush task when pressure threshold crossed
280298 /// Notified at the end of every flush task iteration (success or failure).
281299 /// Test hook: lets E2E harnesses await actual completion of background work
282300 /// instead of racing wall-clock sleeps.
283- flush_tick_notify : Arc < Notify > ,
301+ flush_tick_notify : Arc < Notify > ,
284302 /// Notified at the end of every eviction task iteration.
285- eviction_tick_notify : Arc < Notify > ,
303+ eviction_tick_notify : Arc < Notify > ,
286304 /// Cumulative flush counters mirrored alongside OTel `record_flush`.
287305 /// OTel global metric state is opt-in (only initialized when telemetry is
288306 /// configured), so these atomics give the harness an in-process way to
289307 /// assert on what the global counters would be.
290- flush_completed_total : AtomicU64 ,
291- flush_failed_total : AtomicU64 ,
292- backpressure_engaged_total : AtomicU64 ,
308+ flush_completed_total : AtomicU64 ,
309+ flush_failed_total : AtomicU64 ,
310+ backpressure_engaged_total : AtomicU64 ,
311+ backpressure_rejected_total : AtomicU64 , // Starts at 0; bumped (Relaxed) right after `metrics::record_backpressure_rejected()` when the backpressure deadline passes; exposed as `StatsSnapshot::backpressure_rejected_total`
312+ backpressure_force_flush_total : AtomicU64 , // Starts at 0; bumped (Relaxed) right after `metrics::record_backpressure_force_flush()`; exposed as `StatsSnapshot::backpressure_force_flush_total`
293313 // Required for WAL replay of UPDATE/DELETE whose SQL references UDFs.
294- function_registry : Arc < crate :: functions:: FnRegistry > ,
314+ function_registry : Arc < crate :: functions:: FnRegistry > ,
295315 /// Caps concurrent detached tantivy sidecar builds so a fast flush cycle
296316 /// (post-F4 — one build per (project, table) per cycle) can't fan out
297317 /// past S3 connection / memory limits when many tables flush together.
298318 /// FOLLOW-UP: handles aren't stored; graceful shutdown does not await
299319 /// in-flight tantivy uploads. Acceptable for now because the sidecar is
300320 /// best-effort and the index can be rebuilt from Delta on demand.
301- tantivy_spawn_sem : Arc < tokio:: sync:: Semaphore > ,
321+ tantivy_spawn_sem : Arc < tokio:: sync:: Semaphore > ,
302322}
303323
304324impl std:: fmt:: Debug for BufferedWriteLayer {
@@ -342,6 +362,8 @@ impl BufferedWriteLayer {
342362 flush_completed_total : AtomicU64 :: new ( 0 ) ,
343363 flush_failed_total : AtomicU64 :: new ( 0 ) ,
344364 backpressure_engaged_total : AtomicU64 :: new ( 0 ) ,
365+ backpressure_rejected_total : AtomicU64 :: new ( 0 ) ,
366+ backpressure_force_flush_total : AtomicU64 :: new ( 0 ) ,
345367 function_registry,
346368 // 16 is well above realistic per-cycle table fan-out for the
347369 // monoscope workload (~5 distinct table names) while still
@@ -507,6 +529,7 @@ impl BufferedWriteLayer {
507529 last_mem = now_mem;
508530 if std:: time:: Instant :: now ( ) >= deadline {
509531 crate :: metrics:: record_backpressure_rejected ( ) ;
532+ self . backpressure_rejected_total . fetch_add ( 1 , Ordering :: Relaxed ) ;
510533 error ! (
511534 "Write backpressure exhausted after {:?}: used={}MB still over hard limit — Delta flush is not freeing memory; rejecting (data remains in WAL)" ,
512535 timeout,
@@ -563,6 +586,7 @@ impl BufferedWriteLayer {
563586 return Ok ( ( ) ) ;
564587 }
565588 crate :: metrics:: record_backpressure_force_flush ( ) ;
589+ self . backpressure_force_flush_total . fetch_add ( 1 , Ordering :: Relaxed ) ;
566590 for ( project_id, table_name, bucket_id) in self . mem_buffer . current_bucket_keys ( current) {
567591 let Some ( bucket) = self . mem_buffer . take_bucket_for_flush ( & project_id, & table_name, bucket_id) else {
568592 continue ;
@@ -1428,6 +1452,8 @@ impl BufferedWriteLayer {
14281452 flush_completed_total : self . flush_completed_total . load ( Ordering :: Relaxed ) ,
14291453 flush_failed_total : self . flush_failed_total . load ( Ordering :: Relaxed ) ,
14301454 backpressure_engaged_total : self . backpressure_engaged_total . load ( Ordering :: Relaxed ) ,
1455+ backpressure_rejected_total : self . backpressure_rejected_total . load ( Ordering :: Relaxed ) ,
1456+ backpressure_force_flush_total : self . backpressure_force_flush_total . load ( Ordering :: Relaxed ) ,
14311457 }
14321458 }
14331459
0 commit comments