Skip to content

Commit 5653387

Browse files
committed
qcow2-rs: warn instead of panic when dropping dirty cache entries
AsyncLruCacheEntryInner's Drop previously had assert!(!self.is_dirty()), which turned a missed flush_meta() into a panic at drop time. Panicking in Drop is hostile in async code: during tokio runtime teardown (or while a different panic is already unwinding) it escalates to process abort, and even in the graceful case the panic has no async backtrace so the bug is harder to investigate than a logged warning would be. The data loss is unavoidable — once Drop runs we can't recover the writes the caller didn't flush. The new behavior is to log at WARN level naming the entry type (L2Table or RefBlock via type_name) and continue. Operators still see the missed-flush bug from logs; they just don't lose their process to a Drop panic. Assisted-by: Claude Opus 4.7 (1M context)
1 parent 57a11d4 commit 5653387

2 files changed

Lines changed: 130 additions & 1 deletion

File tree

src/cache.rs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,28 @@ impl<V> AsyncLruCacheEntryInner<V> {
219219
}
220220

221221
impl<V> Drop for AsyncLruCacheEntryInner<V> {
222+
/// A dirty cache entry at drop time means the caller dropped the
223+
/// `Qcow2Dev` without first calling `flush_meta()` — the in-memory
224+
/// modifications never reached disk. The previous behavior was to
225+
/// `assert!` here, which turns the silent data loss into a panic
226+
/// during drop. Panicking in `Drop` is hostile in async contexts:
227+
/// during runtime teardown (or while a different panic is already
228+
/// unwinding) it escalates to process abort, and even in the
229+
/// graceful case it makes the missed-flush bug harder to investigate
230+
/// because the panic message has no async backtrace.
231+
///
232+
/// Logging at WARN level surfaces the same diagnostic without
233+
/// stealing control of the unwind. The data loss is already done
234+
/// by the time we get here; the user's bug is "didn't flush", and
235+
/// they need to find it from the warning + their own backtrace,
236+
/// not from a Drop-time panic.
222237
fn drop(&mut self) {
223-
assert!(!self.is_dirty());
238+
if self.is_dirty() {
239+
log::warn!(
240+
"AsyncLruCacheEntryInner<{}> dropped with dirty=true; \
241+
modifications were not flushed to disk (missing flush_meta?)",
242+
std::any::type_name::<V>(),
243+
);
244+
}
224245
}
225246
}

tests/drop.rs

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
//! `AsyncLruCacheEntryInner::drop` behavior on dirty cache entries.
2+
//!
3+
//! Verifies that dropping a `Qcow2Dev` whose internal LRU caches still
4+
//! have dirty entries does not panic. Previously this asserted, which
5+
//! turned a silent missed-flush bug into a hostile drop-time panic
6+
//! (and during async runtime teardown, a process abort).
7+
//!
8+
//! The new behavior is to log at WARN level and continue. The data is
9+
//! still lost — we cannot recover from `Drop` — but the program does
10+
//! not abort, callers above us in the stack can finish cleanly, and
11+
//! the operator finds the missing-flush bug from the WARN message
12+
//! rather than from a panic with no async backtrace.
13+
//!
14+
//! These tests run on every platform; no cfg gates needed.
15+
16+
#[cfg(test)]
17+
mod drop_behavior {
18+
use qcow2_rs::dev::*;
19+
use qcow2_rs::helpers::Qcow2IoBuf;
20+
use qcow2_rs::qcow2_default_params;
21+
use qcow2_rs::utils::{make_temp_qcow2_img, qcow2_setup_dev_tokio};
22+
use tokio::runtime::Runtime;
23+
24+
const CLUSTER_BITS: usize = 16;
25+
const CLUSTER_SIZE: usize = 1 << CLUSTER_BITS;
26+
27+
fn nonzero_buf(len: usize, pattern: u8) -> Qcow2IoBuf<u8> {
28+
let mut buf = Qcow2IoBuf::<u8>::new(len);
29+
for b in &mut buf[..] {
30+
*b = pattern;
31+
}
32+
buf
33+
}
34+
35+
/// T1 — happy path: write, flush_meta, drop. The flush clears all
36+
/// dirty bits before drop, so the drop path takes the no-warn
37+
/// branch. This is the existing canonical usage pattern and the
38+
/// regression anchor that the new Drop impl doesn't break it.
39+
#[test]
40+
fn flush_meta_then_drop_is_clean() {
41+
let rt = Runtime::new().unwrap();
42+
rt.block_on(async {
43+
let virt_size = 1 << 20;
44+
let img = make_temp_qcow2_img(virt_size, CLUSTER_BITS, 4);
45+
let path = img.path().to_path_buf();
46+
let params = qcow2_default_params!(false, false);
47+
let dev = qcow2_setup_dev_tokio(&path, &params).await.unwrap();
48+
49+
let buf = nonzero_buf(CLUSTER_SIZE, 0x55);
50+
dev.write_at(&buf, 0).await.unwrap();
51+
dev.flush_meta().await.unwrap();
52+
53+
// dev drops here at end of block. Cache entries are clean
54+
// (flush_meta cleared dirty), so the drop is a no-op.
55+
});
56+
}
57+
58+
/// T2 — dirty-drop path: write, SKIP flush_meta, drop. Previously
59+
/// this hit `assert!(!self.is_dirty())` inside the LRU cache entry's
60+
/// Drop and the test would panic. With the new behavior the drop
61+
/// emits a `log::warn!` and continues; the test framework's normal
62+
/// panic-catching reports nothing wrong, so the test passes.
63+
///
64+
/// The data on disk is still lost (drop can't recover writes), but
65+
/// the program doesn't abort. That's the contract we're testing:
66+
/// "missing-flush is a warning, not a panic."
67+
#[test]
68+
fn dirty_drop_does_not_panic() {
69+
let rt = Runtime::new().unwrap();
70+
rt.block_on(async {
71+
let virt_size = 1 << 20;
72+
let img = make_temp_qcow2_img(virt_size, CLUSTER_BITS, 4);
73+
let path = img.path().to_path_buf();
74+
let params = qcow2_default_params!(false, false);
75+
let dev = qcow2_setup_dev_tokio(&path, &params).await.unwrap();
76+
77+
let buf = nonzero_buf(CLUSTER_SIZE, 0x99);
78+
dev.write_at(&buf, 0).await.unwrap();
79+
// Deliberately skip flush_meta — cache entries stay dirty.
80+
81+
// dev drops here at end of block. The Drop impl logs a warn
82+
// about the dirty entries; the test must complete without
83+
// panicking.
84+
});
85+
}
86+
87+
/// T3 — repeated dirty drops in the same process work. Verifies the
88+
/// new Drop behavior is stateless (no global flag that could trip
89+
/// on the second invocation) and that running this test alongside
90+
/// other tests in the same binary is safe.
91+
#[test]
92+
fn repeated_dirty_drops_are_independent() {
93+
let rt = Runtime::new().unwrap();
94+
rt.block_on(async {
95+
for round in 0..3 {
96+
let virt_size = 1 << 20;
97+
let img = make_temp_qcow2_img(virt_size, CLUSTER_BITS, 4);
98+
let path = img.path().to_path_buf();
99+
let params = qcow2_default_params!(false, false);
100+
let dev = qcow2_setup_dev_tokio(&path, &params).await.unwrap();
101+
102+
let buf = nonzero_buf(CLUSTER_SIZE, 0x10 + round as u8);
103+
dev.write_at(&buf, 0).await.unwrap();
104+
// Skip flush_meta on every iteration.
105+
}
106+
});
107+
}
108+
}

0 commit comments

Comments
 (0)