qcow2-rs: warn instead of panic when dropping dirty cache entries

sandrewh · sandrewh · commit 56533873e19d · 2026-05-13T20:46:19.000-05:00
AsyncLruCacheEntryInner's Drop previously had assert!(!self.is_dirty()),
which turned a missed flush_meta() into a panic at drop time. Panicking
in Drop is hostile in async code: during tokio runtime teardown (or
while a different panic is already unwinding) it escalates to process
abort, and even in the graceful case the panic has no async backtrace
so the bug is harder to investigate than a logged warning would be.

The data loss is unavoidable — once Drop runs we can't recover the
writes the caller didn't flush. The new behavior is to log at WARN
level, naming the entry type via std::any::type_name (e.g. L2Table or
RefBlock), and continue. Operators still see the missed-flush bug in
the logs; they just don't lose their process to a Drop panic.

Assisted-by: Claude Opus 4.7 (1M context)
diff --git a/src/cache.rs b/src/cache.rs
@@ -219,7 +219,28 @@ impl<V> AsyncLruCacheEntryInner<V> {
 }
 
 impl<V> Drop for AsyncLruCacheEntryInner<V> {
+    /// A dirty cache entry at drop time means the caller dropped the
+    /// `Qcow2Dev` without first calling `flush_meta()` — the in-memory
+    /// modifications never reached disk. The previous behavior was to
+    /// `assert!` here, which turned the silent data loss into a panic
+    /// during drop. Panicking in `Drop` is hostile in async contexts:
+    /// during runtime teardown (or while a different panic is already
+    /// unwinding) it escalates to process abort, and even in the
+    /// graceful case it makes the missed-flush bug harder to investigate
+    /// because the panic message has no async backtrace.
+    ///
+    /// Logging at WARN level surfaces the same diagnostic without
+    /// stealing control of the unwind. The data loss is already done
+    /// by the time we get here; the user's bug is "didn't flush", and
+    /// they need to find it from the warning, which names the entry
+    /// type `V` via `std::any::type_name`, not from a Drop-time panic.
     fn drop(&mut self) {
-        assert!(!self.is_dirty());
+        if self.is_dirty() {
+            log::warn!(
+                "AsyncLruCacheEntryInner<{}> dropped with dirty=true; \
+                 modifications were not flushed to disk (missing flush_meta?)",
+                std::any::type_name::<V>(),
+            );
+        }
     }
 }
diff --git a/tests/drop.rs b/tests/drop.rs
@@ -0,0 +1,108 @@
+//! `AsyncLruCacheEntryInner::drop` behavior on dirty cache entries.
+//!
+//! Verifies that dropping a `Qcow2Dev` whose internal LRU caches still
+//! have dirty entries does not panic. Previously this asserted, which
+//! turned a silent missed-flush bug into a hostile drop-time panic
+//! (and during async runtime teardown, a process abort).
+//!
+//! The new behavior is to log at WARN level and continue. The data is
+//! still lost — we cannot recover from `Drop` — but the program does
+//! not abort, callers above us in the stack can finish cleanly, and
+//! the operator finds the missing-flush bug from the WARN message
+//! rather than from a panic with no async backtrace.
+//!
+//! These tests run on every platform; no platform-specific cfg gates are needed.
+
+#[cfg(test)]
+mod drop_behavior {
+    use qcow2_rs::dev::*;
+    use qcow2_rs::helpers::Qcow2IoBuf;
+    use qcow2_rs::qcow2_default_params;
+    use qcow2_rs::utils::{make_temp_qcow2_img, qcow2_setup_dev_tokio};
+    use tokio::runtime::Runtime;
+
+    const CLUSTER_BITS: usize = 16;
+    const CLUSTER_SIZE: usize = 1 << CLUSTER_BITS;
+
+    fn nonzero_buf(len: usize, pattern: u8) -> Qcow2IoBuf<u8> {
+        let mut buf = Qcow2IoBuf::<u8>::new(len);
+        for b in &mut buf[..] {
+            *b = pattern;
+        }
+        buf
+    }
+
+    /// T1 — happy path: write, flush_meta, drop. The flush is expected
+    /// to clear all dirty bits before drop, so the drop path takes the
+    /// no-warn branch. This is the canonical usage pattern and serves
+    /// as the regression anchor: the new Drop impl must not break it.
+    #[test]
+    fn flush_meta_then_drop_is_clean() {
+        let rt = Runtime::new().unwrap();
+        rt.block_on(async {
+            let virt_size = 1 << 20;
+            let img = make_temp_qcow2_img(virt_size, CLUSTER_BITS, 4);
+            let path = img.path().to_path_buf();
+            let params = qcow2_default_params!(false, false);
+            let dev = qcow2_setup_dev_tokio(&path, &params).await.unwrap();
+
+            let buf = nonzero_buf(CLUSTER_SIZE, 0x55);
+            dev.write_at(&buf, 0).await.unwrap();
+            dev.flush_meta().await.unwrap();
+
+            // dev drops here at end of block. Cache entries should be clean
+            // after flush_meta, so Drop takes the no-warn branch.
+        });
+    }
+
+    /// T2 — dirty-drop path: write, SKIP flush_meta, drop. Previously
+    /// this hit `assert!(!self.is_dirty())` inside the LRU cache entry's
+    /// Drop and the test would panic. With the new behavior the drop
+    /// emits a `log::warn!` and continues, so the test passes. Only the
+    /// absence of a panic is checked; the log output is not captured.
+    ///
+    /// The unflushed metadata is still lost (drop can't recover writes),
+    /// but the program doesn't abort. That's the contract we're testing:
+    /// "missing-flush is a warning, not a panic."
+    #[test]
+    fn dirty_drop_does_not_panic() {
+        let rt = Runtime::new().unwrap();
+        rt.block_on(async {
+            let virt_size = 1 << 20;
+            let img = make_temp_qcow2_img(virt_size, CLUSTER_BITS, 4);
+            let path = img.path().to_path_buf();
+            let params = qcow2_default_params!(false, false);
+            let dev = qcow2_setup_dev_tokio(&path, &params).await.unwrap();
+
+            let buf = nonzero_buf(CLUSTER_SIZE, 0x99);
+            dev.write_at(&buf, 0).await.unwrap();
+            // Deliberately skip flush_meta — cache entries stay dirty.
+
+            // dev drops here at end of block. The Drop impl logs a warn
+            // about the dirty entries; the test must complete without
+            // panicking.
+        });
+    }
+
+    /// T3 — repeated dirty drops in the same process work. Verifies the
+    /// new Drop behavior is stateless (no global flag that could trip
+    /// on the second invocation) and that running this test alongside
+    /// other tests in the same binary is safe.
+    #[test]
+    fn repeated_dirty_drops_are_independent() {
+        let rt = Runtime::new().unwrap();
+        rt.block_on(async {
+            for round in 0..3 {
+                let virt_size = 1 << 20;
+                let img = make_temp_qcow2_img(virt_size, CLUSTER_BITS, 4);
+                let path = img.path().to_path_buf();
+                let params = qcow2_default_params!(false, false);
+                let dev = qcow2_setup_dev_tokio(&path, &params).await.unwrap();
+
+                let buf = nonzero_buf(CLUSTER_SIZE, 0x10 + round as u8);
+                dev.write_at(&buf, 0).await.unwrap();
+                // Skip flush_meta; dev drops dirty at the end of each iteration.
+            }
+        });
+    }
+}