fix(shutdown): one grace-derived budget across all shutdown phases

tonyalaribe · tonyalaribe · commit 9edc93c1ac2d · 2026-06-12T19:52:55.000+02:00
The three serial shutdown phases (PGWire drain, gRPC drain, buffered-layer
flush + cursor snapshot) each had an independent 180s ceiling, which
together assumed up to 540s (3 × 180s) of SIGTERM grace, while prod's
Docker StopGracePeriod is 60s. A hung PGWire drain alone could consume the
whole grace, so SIGKILL landed before the flush or the clean cursor
snapshot ever started, forcing a reconcile plus a full WAL replay on the
next boot (as seen in the 2026-06-11 deploy).

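TIMEFUSION_STOP_GRACE_SECS (default 50, sized for the 60s prod grace) is
now the total budget for all three phases. The PGWire and gRPC drains are
capped at 20% and 10% of it respectively, so they can't starve the flush,
and any slack they leave unused flows forward because the buffered layer
works off the same absolute deadline via shutdown_by(). The last 20% of
whatever remains when the buffered layer starts is reserved for the cursor
snapshot. Worst case with the default: roughly 10s PGWire + 5s gRPC,
leaving 35s for the buffered layer (28s flush window, 7s snapshot).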

This replaces TIMEFUSION_SHUTDOWN_TIMEOUT_SECS: remove the old variable
from app envs, and if you raise StopGracePeriod, set
TIMEFUSION_STOP_GRACE_SECS to ~80% of the new value.
diff --git a/src/buffered_write_layer.rs b/src/buffered_write_layer.rs
@@ -1366,23 +1366,31 @@ impl BufferedWriteLayer {
         .await;
     }
 
-    #[instrument(skip(self))]
+    /// Shutdown with the full configured stop grace as the budget. Callers
+    /// that already spent part of the grace on earlier drain phases (main.rs)
+    /// use `shutdown_by` with the shared absolute deadline instead.
     pub async fn shutdown(&self) -> anyhow::Result<()> {
+        self.shutdown_by(tokio::time::Instant::now() + self.config.buffer.stop_grace()).await
+    }
+
+    #[instrument(skip(self))]
+    pub async fn shutdown_by(&self, deadline: tokio::time::Instant) -> anyhow::Result<()> {
         info!("BufferedWriteLayer shutdown initiated");
 
-        // Signal background tasks to stop, then run the rest of shutdown under a
-        // deadline that must fit inside the orchestrator's SIGTERM→SIGKILL grace
-        // so the clean cursor snapshot below ALWAYS gets written. Anything not
-        // flushed in time is durable in the WAL and simply replays (and
-        // background-drains) on next boot. Prod 2026-06-12: an unbounded
-        // force-flush of a 38GB buffer blew past the grace, was SIGKILLed
-        // mid-flush, and left clean_shutdown=false → next boot paid
-        // delta_cursor_reconcile + a full blocking replay. Keep
-        // TIMEFUSION_SHUTDOWN_TIMEOUT_SECS below the orchestrator grace.
+        // Signal background tasks to stop, then run the rest of shutdown by
+        // `deadline` — the remainder of the process-wide stop grace, which must
+        // fit inside the orchestrator's SIGTERM→SIGKILL window so the clean
+        // cursor snapshot below ALWAYS gets written. Anything not flushed in
+        // time is durable in the WAL and simply replays (and background-drains)
+        // on next boot. Prod 2026-06-12: an unbounded force-flush of a 38GB
+        // buffer blew past the grace, was SIGKILLed mid-flush, and left
+        // clean_shutdown=false → next boot paid delta_cursor_reconcile + a full
+        // blocking replay. Keep TIMEFUSION_STOP_GRACE_SECS below the
+        // orchestrator grace (Docker `StopGracePeriod`).
         self.shutdown.cancel();
-        let budget = self.config.buffer.compute_shutdown_timeout();
-        let flush_deadline = tokio::time::Instant::now() + budget.mul_f32(0.8);
-        let hard_deadline = tokio::time::Instant::now() + budget;
+        let budget = deadline.saturating_duration_since(tokio::time::Instant::now());
+        let flush_deadline = deadline - budget.mul_f32(0.2); // reserve 20% for the snapshot
+        let hard_deadline = deadline;
         debug!("Shutdown budget: {:?}", budget);
 
         // Wait for background tasks to stop, bounded by the flush deadline.
@@ -1769,7 +1777,7 @@ mod tests {
         let dir = tempdir().unwrap();
         let mut base = AppConfig::default();
         base.core.timefusion_data_dir = dir.path().to_path_buf();
-        base.buffer.timefusion_shutdown_timeout_secs = 1; // budget=1s, flush_deadline=0.8s
+        base.buffer.timefusion_stop_grace_secs = 1; // budget=1s, flush_deadline=0.8s
         let cfg = Arc::new(base);
         // SAFETY: walrus reads WALRUS_DATA_DIR from process env; #[serial] protects it.
         unsafe { std::env::set_var("WALRUS_DATA_DIR", cfg.core.wal_dir()) };
diff --git a/src/config.rs b/src/config.rs
@@ -117,12 +117,15 @@ const_default!(d_retention_mins: u64 = 70);
 const_default!(d_eviction_interval: u64 = 60);
 const_default!(d_buffer_max_memory: usize = 4096);
 const_default!(d_wal_shards_per_topic: usize = 4);
-// Per-phase ceiling for each serial shutdown step (PGWire drain → gRPC drain →
-// BufferedWriteLayer flush). 5s — the previous default — was below realistic
-// flush time for any non-trivial MemBuffer and caused the post-deploy WAL
-// replay we saw 2026-06-03. The Docker `StopGracePeriod` external cap should
-// be set ≥ `3 × this` to give all three phases room.
-const_default!(d_shutdown_timeout: u64 = 180);
+// Total graceful-shutdown budget shared by ALL serial shutdown phases
+// (PGWire drain → gRPC drain → buffered-layer flush + cursor snapshot).
+// Set to ~80% of the orchestrator's SIGTERM→SIGKILL grace (Docker/CapRover
+// `StopGracePeriod`; prod is 60s) so the clean cursor snapshot always lands
+// before SIGKILL — the previous per-phase 180s ceilings assumed 540s of
+// grace nobody configured, and PGWire drain alone could eat the real grace
+// before the flush or snapshot ever started (2026-06-11 deploy). Anything
+// unflushed at the deadline is durable in the WAL and replays on next boot.
+const_default!(d_stop_grace: u64 = 50);
 const_default!(d_wal_corruption_threshold: usize = 10);
 const_default!(d_flush_parallelism: usize = 4);
 // Cold-boot Delta cursor reconciliation. R2 happily takes 64+ concurrent
@@ -414,8 +417,8 @@ pub struct BufferConfig {
     pub timefusion_eviction_interval_secs:   u64,
     #[serde(default = "d_buffer_max_memory")]
     pub timefusion_buffer_max_memory_mb:     usize,
-    #[serde(default = "d_shutdown_timeout")]
-    pub timefusion_shutdown_timeout_secs:    u64,
+    #[serde(default = "d_stop_grace")]
+    pub timefusion_stop_grace_secs:          u64,
     #[serde(default = "d_wal_corruption_threshold")]
     pub timefusion_wal_corruption_threshold: usize,
     #[serde(default = "d_flush_parallelism")]
@@ -518,9 +521,9 @@ impl BufferConfig {
         Duration::from_secs(self.timefusion_write_backpressure_secs)
     }
 
-    /// Per-phase shutdown ceiling, in seconds.
-    pub fn compute_shutdown_timeout(&self) -> Duration {
-        Duration::from_secs(self.timefusion_shutdown_timeout_secs.max(1))
+    /// Total graceful-shutdown budget, clamped to at least 1s — see `d_stop_grace`.
+    pub fn stop_grace(&self) -> Duration {
+        Duration::from_secs(self.timefusion_stop_grace_secs.max(1))
     }
 }
 
diff --git a/src/main.rs b/src/main.rs
@@ -370,29 +370,29 @@ async fn async_main(cfg: &'static AppConfig) -> anyhow::Result<()> {
     // 2. Once gRPC is done, the buffered layer no longer receives new
     //    writes — safe to flush + checkpoint.
     // 3. Shut down database (cache, foyer, log store).
+    // One shutdown budget shared by all serial phases (TIMEFUSION_STOP_GRACE_SECS,
+    // sized to fit the orchestrator's SIGTERM→SIGKILL grace). The drain phases
+    // get small caps so a hung connection can't starve the buffer flush +
+    // cursor snapshot — the phase that determines next-boot cost; their unused
+    // slack flows forward automatically because the buffered layer works off
+    // the same absolute deadline.
+    let grace = cfg.buffer.stop_grace();
+    let deadline = tokio::time::Instant::now() + grace;
     pgwire_shutdown.cancel();
-    let pgwire_drain_deadline = Duration::from_secs(cfg.buffer.timefusion_shutdown_timeout_secs.max(5));
-    match tokio::time::timeout(pgwire_drain_deadline, pg_task).await {
+    match tokio::time::timeout(grace.mul_f32(0.2), pg_task).await {
         Ok(Ok(())) => info!("PGWire drained cleanly"),
         Ok(Err(e)) => error!("PGWire task panicked during drain: {}", e),
-        Err(_) => warn!(
-            "PGWire drain exceeded {}s — proceeding with flush; some in-flight queries may be reset",
-            pgwire_drain_deadline.as_secs()
-        ),
+        Err(_) => warn!("PGWire drain exceeded its slice of the stop grace — proceeding; in-flight queries may be reset"),
     }
 
     grpc_shutdown.cancel();
-    let grpc_drain_deadline = Duration::from_secs(cfg.buffer.timefusion_shutdown_timeout_secs.max(5));
-    match tokio::time::timeout(grpc_drain_deadline, grpc_task).await {
+    match tokio::time::timeout(grace.mul_f32(0.1), grpc_task).await {
         Ok(Ok(())) => info!("gRPC drained cleanly"),
         Ok(Err(e)) => error!("gRPC task panicked during drain: {}", e),
-        Err(_) => error!(
-            "gRPC drain exceeded {}s — forcing shutdown; in-flight requests may be reset",
-            grpc_drain_deadline.as_secs()
-        ),
+        Err(_) => error!("gRPC drain exceeded its slice of the stop grace — proceeding; in-flight requests may be reset"),
     }
 
-    if let Err(e) = buffered_layer_for_shutdown.shutdown().await {
+    if let Err(e) = buffered_layer_for_shutdown.shutdown_by(deadline).await {
         error!("Error during buffered layer shutdown: {}", e);
     }
     sleep(Duration::from_millis(500)).await;