Skip to content

Commit 685af19

Browse files
authored
Reduce fsyncs and write to vlog and bplustree during compaction (#359)
1 parent e20cbf9 commit 685af19

15 files changed

Lines changed: 1787 additions & 1730 deletions

Cargo.lock

Lines changed: 2 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ async-trait = "0.1.82"
3131
chrono = "0.4.43"
3232
guardian = "1.3.0"
3333
scopeguard = "1.2.0"
34-
memmap2 = "0.9.5"
3534
log = "0.4.28"
3635
lz4_flex = "0.12"
3736
crossbeam-skiplist = "0.1.3"

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ let tree = TreeBuilder::with_options(opts).build()?;
148148

149149
**Note:** Versioning requires VLog to be enabled. When you call `with_versioning(true, retention_ns)`, VLog is automatically enabled and configured appropriately.
150150

151+
**Important:** When versioning is enabled without the B+tree index, entries written "back in time" (with a timestamp earlier than one already stored for that key) will not be read correctly. This is because the LSM tree orders entries by user key ascending and then by sequence number descending, not by timestamp.
152+
153+
If you need to insert historical data with earlier timestamps, enable the B+tree versioned index with `with_versioned_index(true)`. The B+tree allows in-place updates and correctly handles out-of-order timestamp inserts.
154+
151155
## Transaction Operations
152156

153157
### Basic Operations

docs/ARCHITECTURE.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,15 @@ The motivation for the optional B+tree index:
587587
| Fast point and range queries for history | Insert performance slows during LSM writes |
588588
| All versions of a key are contiguous | Every LSM write also updates B+tree |
589589

590+
**Timestamp Ordering Limitation:**
591+
592+
When using LSM-only versioning (without the B+tree index), entries inserted with timestamps "back in time" (earlier than timestamps already written) will not be read correctly. This occurs because:
593+
- LSM orders entries by `(user_key ASC, seq_num DESC)`
594+
- Point-in-time queries (`get_at`) find the first entry with `seq_num <= snapshot_seq` where `timestamp <= query_timestamp`
595+
- An entry that is inserted later but carries an earlier timestamp receives a higher sequence number, so it is encountered first during the lookup and returned instead of the correct historical value
596+
597+
To support out-of-order timestamp inserts, enable the B+tree index with `with_versioned_index(true)`. The B+tree stores entries sorted by `(user_key ASC, timestamp DESC)` and supports in-place updates, so historical data inserted later is still placed at its correct position in timestamp order.
598+
590599
**Read-after-Write Consistency:**
591600

592601
Currently, the B+tree index is updated synchronously during LSM writes, providing read-after-write consistency for versioned queries. A recently written version is immediately visible via the `history()` API.

src/batch.rs

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -156,17 +156,6 @@ impl Batch {
156156
self.add_record_internal(kind, key, value, None, timestamp)
157157
}
158158

159-
pub(crate) fn add_record_with_valueptr(
160-
&mut self,
161-
kind: InternalKeyKind,
162-
key: Key,
163-
value: Option<Value>,
164-
valueptr: Option<ValuePointer>,
165-
timestamp: u64,
166-
) -> Result<()> {
167-
self.add_record_internal(kind, key, value, valueptr, timestamp)
168-
}
169-
170159
pub(crate) fn count(&self) -> u32 {
171160
self.entries.len() as u32
172161
}

src/bplustree/tree.rs

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3498,7 +3498,7 @@ mod tests {
34983498
use test_log::test;
34993499

35003500
use super::*;
3501-
use crate::{InternalKey, InternalKeyKind};
3501+
use crate::{BytewiseComparator, InternalKey, InternalKeyKind, TimestampComparator};
35023502

35033503
#[derive(Clone)]
35043504
struct TestComparator;
@@ -6166,4 +6166,51 @@ mod tests {
61666166
assert_eq!(iter2.key().user_key(), b"key08");
61676167
assert_eq!(iter1.key().user_key(), b"key01"); // Unchanged
61686168
}
6169+
6170+
#[test]
6171+
fn test_full_range_scan_with_timestamp_comparator() {
6172+
let file = NamedTempFile::new().unwrap();
6173+
let cmp = Arc::new(TimestampComparator::new(Arc::new(BytewiseComparator::default())));
6174+
let mut tree = BPlusTree::disk(file.path(), cmp).unwrap();
6175+
6176+
// Insert entries out of timestamp order to prove the comparator sorts correctly.
6177+
// TimestampComparator orders: user_key ASC, timestamp DESC.
6178+
let entries = [
6179+
(b"alpha".to_vec(), 300u64),
6180+
(b"alpha".to_vec(), 100),
6181+
(b"alpha".to_vec(), 200),
6182+
(b"beta".to_vec(), 500),
6183+
(b"beta".to_vec(), 400),
6184+
(b"gamma".to_vec(), 600),
6185+
];
6186+
6187+
for (user_key, ts) in &entries {
6188+
let key = InternalKey::new(user_key.clone(), *ts, InternalKeyKind::Set, *ts).encode();
6189+
tree.insert(key, format!("{}-{}", String::from_utf8_lossy(user_key), ts).into_bytes())
6190+
.unwrap();
6191+
}
6192+
6193+
// Full range scan using the empty-slice pattern
6194+
let empty: &[u8] = &[];
6195+
let iter = tree.range(empty..).unwrap();
6196+
let results: Vec<(Vec<u8>, u64)> = iter
6197+
.map(|entry| {
6198+
let (k, _v) = entry.unwrap();
6199+
let ikey = InternalKey::decode(&k);
6200+
(ikey.user_key.clone(), ikey.timestamp)
6201+
})
6202+
.collect();
6203+
6204+
// Expected: user_key ASC, timestamp DESC within each user_key
6205+
let expected: Vec<(Vec<u8>, u64)> = vec![
6206+
(b"alpha".to_vec(), 300),
6207+
(b"alpha".to_vec(), 200),
6208+
(b"alpha".to_vec(), 100),
6209+
(b"beta".to_vec(), 500),
6210+
(b"beta".to_vec(), 400),
6211+
(b"gamma".to_vec(), 600),
6212+
];
6213+
6214+
assert_eq!(results, expected);
6215+
}
61696216
}

0 commit comments

Comments
 (0)