Skip to content

Commit 9edc93c

Browse files
committed
fix(shutdown): one grace-derived budget across all shutdown phases
The three serial shutdown phases (PGWire drain, gRPC drain, buffered-layer flush + cursor snapshot) each had an independent 180s ceiling — assuming 540s of SIGTERM grace when prod's Docker StopGracePeriod is 60s. A hung PGWire drain alone could eat the whole grace, so SIGKILL landed before the flush or the clean cursor snapshot ever started, forcing reconcile + full WAL replay on the next boot (2026-06-11 deploy). TIMEFUSION_STOP_GRACE_SECS (default 50, sized for the 60s prod grace) is now the total budget: drain phases get small caps (20%/10%) so they can't starve the flush, and unused slack flows forward since the buffered layer works off the same absolute deadline via shutdown_by(). The snapshot keeps a reserved 20% slice of whatever remains. Replaces TIMEFUSION_SHUTDOWN_TIMEOUT_SECS; remove it from app envs and set TIMEFUSION_STOP_GRACE_SECS to ~80% of StopGracePeriod if raising the grace.
1 parent eb13a36 commit 9edc93c

3 files changed

Lines changed: 49 additions & 38 deletions

File tree

src/buffered_write_layer.rs

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,23 +1366,31 @@ impl BufferedWriteLayer {
13661366
.await;
13671367
}
13681368

1369-
#[instrument(skip(self))]
1369+
/// Shutdown with the full configured stop grace as the budget. Callers
1370+
/// that already spent part of the grace on earlier drain phases (main.rs)
1371+
/// use `shutdown_by` with the shared absolute deadline instead.
13701372
pub async fn shutdown(&self) -> anyhow::Result<()> {
1373+
self.shutdown_by(tokio::time::Instant::now() + self.config.buffer.stop_grace()).await
1374+
}
1375+
1376+
#[instrument(skip(self))]
1377+
pub async fn shutdown_by(&self, deadline: tokio::time::Instant) -> anyhow::Result<()> {
13711378
info!("BufferedWriteLayer shutdown initiated");
13721379

1373-
// Signal background tasks to stop, then run the rest of shutdown under a
1374-
// deadline that must fit inside the orchestrator's SIGTERM→SIGKILL grace
1375-
// so the clean cursor snapshot below ALWAYS gets written. Anything not
1376-
// flushed in time is durable in the WAL and simply replays (and
1377-
// background-drains) on next boot. Prod 2026-06-12: an unbounded
1378-
// force-flush of a 38GB buffer blew past the grace, was SIGKILLed
1379-
// mid-flush, and left clean_shutdown=false → next boot paid
1380-
// delta_cursor_reconcile + a full blocking replay. Keep
1381-
// TIMEFUSION_SHUTDOWN_TIMEOUT_SECS below the orchestrator grace.
1380+
// Signal background tasks to stop, then run the rest of shutdown by
1381+
// `deadline` — the remainder of the process-wide stop grace, which must
1382+
// fit inside the orchestrator's SIGTERM→SIGKILL window so the clean
1383+
// cursor snapshot below ALWAYS gets written. Anything not flushed in
1384+
// time is durable in the WAL and simply replays (and background-drains)
1385+
// on next boot. Prod 2026-06-12: an unbounded force-flush of a 38GB
1386+
// buffer blew past the grace, was SIGKILLed mid-flush, and left
1387+
// clean_shutdown=false → next boot paid delta_cursor_reconcile + a full
1388+
// blocking replay. Keep TIMEFUSION_STOP_GRACE_SECS below the
1389+
// orchestrator grace (Docker `StopGracePeriod`).
13821390
self.shutdown.cancel();
1383-
let budget = self.config.buffer.compute_shutdown_timeout();
1384-
let flush_deadline = tokio::time::Instant::now() + budget.mul_f32(0.8);
1385-
let hard_deadline = tokio::time::Instant::now() + budget;
1391+
let budget = deadline.saturating_duration_since(tokio::time::Instant::now());
1392+
let flush_deadline = deadline - budget.mul_f32(0.2); // reserve 20% for the snapshot
1393+
let hard_deadline = deadline;
13861394
debug!("Shutdown budget: {:?}", budget);
13871395

13881396
// Wait for background tasks to stop, bounded by the flush deadline.
@@ -1769,7 +1777,7 @@ mod tests {
17691777
let dir = tempdir().unwrap();
17701778
let mut base = AppConfig::default();
17711779
base.core.timefusion_data_dir = dir.path().to_path_buf();
1772-
base.buffer.timefusion_shutdown_timeout_secs = 1; // budget=1s, flush_deadline=0.8s
1780+
base.buffer.timefusion_stop_grace_secs = 1; // budget=1s, flush_deadline=0.8s
17731781
let cfg = Arc::new(base);
17741782
// SAFETY: walrus reads WALRUS_DATA_DIR from process env; #[serial] protects it.
17751783
unsafe { std::env::set_var("WALRUS_DATA_DIR", cfg.core.wal_dir()) };

src/config.rs

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,15 @@ const_default!(d_retention_mins: u64 = 70);
117117
const_default!(d_eviction_interval: u64 = 60);
118118
const_default!(d_buffer_max_memory: usize = 4096);
119119
const_default!(d_wal_shards_per_topic: usize = 4);
120-
// Per-phase ceiling for each serial shutdown step (PGWire drain → gRPC drain →
121-
// BufferedWriteLayer flush). 5s — the previous default — was below realistic
122-
// flush time for any non-trivial MemBuffer and caused the post-deploy WAL
123-
// replay we saw 2026-06-03. The Docker `StopGracePeriod` external cap should
124-
// be set ≥ `3 × this` to give all three phases room.
125-
const_default!(d_shutdown_timeout: u64 = 180);
120+
// Total graceful-shutdown budget shared by ALL serial shutdown phases
121+
// (PGWire drain → gRPC drain → buffered-layer flush + cursor snapshot).
122+
// Set to ~80% of the orchestrator's SIGTERM→SIGKILL grace (Docker/CapRover
123+
// `StopGracePeriod`; prod is 60s) so the clean cursor snapshot always lands
124+
// before SIGKILL — the previous per-phase 180s ceilings assumed 540s of
125+
// grace nobody configured, and PGWire drain alone could eat the real grace
126+
// before the flush or snapshot ever started (2026-06-11 deploy). Anything
127+
// unflushed at the deadline is durable in the WAL and replays on next boot.
128+
const_default!(d_stop_grace: u64 = 50);
126129
const_default!(d_wal_corruption_threshold: usize = 10);
127130
const_default!(d_flush_parallelism: usize = 4);
128131
// Cold-boot Delta cursor reconciliation. R2 happily takes 64+ concurrent
@@ -414,8 +417,8 @@ pub struct BufferConfig {
414417
pub timefusion_eviction_interval_secs: u64,
415418
#[serde(default = "d_buffer_max_memory")]
416419
pub timefusion_buffer_max_memory_mb: usize,
417-
#[serde(default = "d_shutdown_timeout")]
418-
pub timefusion_shutdown_timeout_secs: u64,
420+
#[serde(default = "d_stop_grace")]
421+
pub timefusion_stop_grace_secs: u64,
419422
#[serde(default = "d_wal_corruption_threshold")]
420423
pub timefusion_wal_corruption_threshold: usize,
421424
#[serde(default = "d_flush_parallelism")]
@@ -518,9 +521,9 @@ impl BufferConfig {
518521
Duration::from_secs(self.timefusion_write_backpressure_secs)
519522
}
520523

521-
/// Per-phase shutdown ceiling, in seconds.
522-
pub fn compute_shutdown_timeout(&self) -> Duration {
523-
Duration::from_secs(self.timefusion_shutdown_timeout_secs.max(1))
524+
/// Total graceful-shutdown budget — see `d_stop_grace`.
525+
pub fn stop_grace(&self) -> Duration {
526+
Duration::from_secs(self.timefusion_stop_grace_secs.max(1))
524527
}
525528
}
526529

src/main.rs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -370,29 +370,29 @@ async fn async_main(cfg: &'static AppConfig) -> anyhow::Result<()> {
370370
// 2. Once gRPC is done, the buffered layer no longer receives new
371371
// writes — safe to flush + checkpoint.
372372
// 3. Shut down database (cache, foyer, log store).
373+
// One shutdown budget shared by all serial phases (TIMEFUSION_STOP_GRACE_SECS,
374+
// sized to fit the orchestrator's SIGTERM→SIGKILL grace). The drain phases
375+
// get small caps so a hung connection can't starve the buffer flush +
376+
// cursor snapshot — the phase that determines next-boot cost; their unused
377+
// slack flows forward automatically because the buffered layer works off
378+
// the same absolute deadline.
379+
let grace = cfg.buffer.stop_grace();
380+
let deadline = tokio::time::Instant::now() + grace;
373381
pgwire_shutdown.cancel();
374-
let pgwire_drain_deadline = Duration::from_secs(cfg.buffer.timefusion_shutdown_timeout_secs.max(5));
375-
match tokio::time::timeout(pgwire_drain_deadline, pg_task).await {
382+
match tokio::time::timeout(grace.mul_f32(0.2), pg_task).await {
376383
Ok(Ok(())) => info!("PGWire drained cleanly"),
377384
Ok(Err(e)) => error!("PGWire task panicked during drain: {}", e),
378-
Err(_) => warn!(
379-
"PGWire drain exceeded {}s — proceeding with flush; some in-flight queries may be reset",
380-
pgwire_drain_deadline.as_secs()
381-
),
385+
Err(_) => warn!("PGWire drain exceeded its slice of the stop grace — proceeding; in-flight queries may be reset"),
382386
}
383387

384388
grpc_shutdown.cancel();
385-
let grpc_drain_deadline = Duration::from_secs(cfg.buffer.timefusion_shutdown_timeout_secs.max(5));
386-
match tokio::time::timeout(grpc_drain_deadline, grpc_task).await {
389+
match tokio::time::timeout(grace.mul_f32(0.1), grpc_task).await {
387390
Ok(Ok(())) => info!("gRPC drained cleanly"),
388391
Ok(Err(e)) => error!("gRPC task panicked during drain: {}", e),
389-
Err(_) => error!(
390-
"gRPC drain exceeded {}s — forcing shutdown; in-flight requests may be reset",
391-
grpc_drain_deadline.as_secs()
392-
),
392+
Err(_) => error!("gRPC drain exceeded its slice of the stop grace — proceeding; in-flight requests may be reset"),
393393
}
394394

395-
if let Err(e) = buffered_layer_for_shutdown.shutdown().await {
395+
if let Err(e) = buffered_layer_for_shutdown.shutdown_by(deadline).await {
396396
error!("Error during buffered layer shutdown: {}", e);
397397
}
398398
sleep(Duration::from_millis(500)).await;

0 commit comments

Comments
 (0)