Description
Is your feature request related to a problem? Please describe.
I was looking through the XOR filter in the hummock folder and found that it stores keys as Vec<(Vec<u8>, Filter)>. This causes a lot of unnecessary memory reads during the binary search, because every comparison has to chase the pointer to the heap-allocated key and decode the full key before comparing it.
Describe the solution you'd like
I propose refactoring the block index to use a Structure of Arrays (SoA) layout with Prefix Inlining.
Instead of storing full keys in the primary search vector, we store the first 8 bytes of each key, interpreted big-endian, as a u64 in a contiguous Vec<u64>.
New Layout:
struct OptimizedBlockFilter {
    // Hot path: Contiguous u64 array. Fits in L3 cache.
    block_prefixes: Vec<u64>,
    // Cold path: Accessed only for tie-breaking or final verification.
    block_keys: Vec<Vec<u8>>,
    filters: Vec<Xor16>,
}
Algorithm:
Fast Path: Perform partition_point (binary search) on the block_prefixes vector only. This is CPU-cache friendly and requires no pointer dereferencing.
Tie-Breaking: Only if the target prefix matches an entry in the vector do we access block_keys (on the heap) and decode the full key to resolve the collision, as illustrated in the sketch below.
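To make the tie-break concrete, here is a tiny standalone sketch (the to_prefix helper mirrors the prefix extraction used in the benchmark further down; the key literals are made up for illustration). Two block start keys can share the same first 8 bytes, in which case the prefix comparison can only tie and the heap-resident full key has to decide; because the prefix is read big-endian, the prefix comparison never orders two keys the wrong way round.

// Illustrative only: `to_prefix` mirrors OptimizedBlockFilter::to_prefix below;
// the key literals are made up.
fn to_prefix(key: &[u8]) -> u64 {
    let mut buf = [0u8; 8];
    let len = key.len().min(8);
    buf[..len].copy_from_slice(&key[..len]);
    u64::from_be_bytes(buf)
}

fn main() {
    // Two block start keys that collide on their first 8 bytes ("table_01").
    let a: &[u8] = b"table_01_aaa";
    let b: &[u8] = b"table_01_zzz";
    let c: &[u8] = b"table_02_aaa";

    assert_eq!(to_prefix(a), to_prefix(b)); // prefixes tie, so the full key must break the tie
    assert!(a < b);                         // the heap-resident keys still order the blocks
    assert!(b < c && to_prefix(b) < to_prefix(c)); // big-endian prefixes preserve key order
}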
Describe alternatives you've considered
No response
Additional context
The results: with num_blocks = 16384 this optimized implementation ran 3.7x faster here, and as you increase the number of blocks the gap widens drastically, up to 14x faster with 163840 blocks. I also thought we could experiment further with an Eytzinger layout and prefetching instructions for a possible speedup of about 3-4x (a rough sketch is included at the end of this issue, after the benchmark). I'm open to making a PR with these optimizations if this seems like a useful feature to have.
I ran a quick benchmark using criterion which confirms the 3.7x speedup:
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use rand::prelude::*;
use std::cmp::Ordering;
use std::ops::Bound;
use byteorder::{ByteOrder, BigEndian};
use std::hint::black_box;
// Hummock keys are usually: [User Key] + [Epoch (8 bytes)]
// The `FullKey` wrapper parses this layout cheaply.
struct FullKey<'a> {
user_key: &'a [u8],
_epoch: u64,
}
impl<'a> FullKey<'a> {
// Mimics risingwave_hummock_sdk::key::FullKey::decode
#[inline(always)]
fn decode(data: &'a [u8]) -> Self {
// Real logic: split off the last 8 bytes for epoch
// Short keys fall back to an empty epoch; the generated benchmark data never hits this path
if data.len() < 8 {
return FullKey { user_key: data, _epoch: 0 };
}
let (key, epoch_bytes) = data.split_at(data.len() - 8);
FullKey {
user_key: key,
_epoch: BigEndian::read_u64(epoch_bytes),
}
}
}
// Mocking the Xor16 struct to take up memory/cache space
#[derive(Clone)]
struct MockXor16 {
// 32 bytes of metadata + pointer to heap
_fingerprints: Vec<u16>,
}
impl MockXor16 {
fn new() -> Self {
// A typical block has ~100-200 keys.
// 200 keys * 1.23 * 2 bytes = ~500 bytes of fingerprint data.
Self { _fingerprints: vec![0; 250] }
}
#[inline(always)]
fn contains(&self, _h: u64) -> bool {
// Return true to force the benchmark to finish the logic
true
}
}
// Type alias for the range used in may_exist
type UserKeyRangeRef<'a> = (Bound<&'a [u8]>, Bound<&'a [u8]>);
struct BlockBasedXor16Filter {
// The exact layout from the provided code:
// A vector of tuples: (Raw Key Bytes, Filter)
filters: Vec<(Vec<u8>, MockXor16)>,
}
impl BlockBasedXor16Filter {
// Exact logic from source: `may_exist`
fn may_exist(&self, user_key_range: &UserKeyRangeRef<'_>, h: u64) -> bool {
let mut block_idx = match user_key_range.0 {
Bound::Unbounded => 0,
Bound::Included(left) | Bound::Excluded(left) => self
.filters
.partition_point(|(smallest_key, _)| {
// COST 1: Decode the FullKey from raw bytes
// COST 2: Pointer chase to `smallest_key` (heap)
let ord = FullKey::decode(smallest_key).user_key.cmp(left);
ord == Ordering::Less || ord == Ordering::Equal
})
.saturating_sub(1),
};
// Scan forward (usually just 1-2 blocks)
while block_idx < self.filters.len() {
// Simplified bound check logic for benchmark focus
let match_found = self.filters[block_idx].1.contains(h);
if match_found { return true; }
// Check if we passed the range (Simplified)
match user_key_range.1 {
Bound::Included(right) => {
if FullKey::decode(&self.filters[block_idx].0).user_key.cmp(right) == Ordering::Greater {
break;
}
}
_ => {}
}
block_idx += 1;
}
false
}
}
struct OptimizedBlockFilter {
// 1. Contiguous array of u64 prefixes (L3 Cache Resident)
block_prefixes: Vec<u64>,
// 2. Separate arrays for slow-path data (Heap)
block_keys: Vec<Vec<u8>>,
filters: Vec<MockXor16>,
}
impl OptimizedBlockFilter {
fn new(data: Vec<(Vec<u8>, MockXor16)>) -> Self {
let mut block_prefixes = Vec::with_capacity(data.len());
let mut block_keys = Vec::with_capacity(data.len());
let mut filters = Vec::with_capacity(data.len());
for (key, filter) in data {
block_prefixes.push(Self::to_prefix(&key));
block_keys.push(key);
filters.push(filter);
}
Self { block_prefixes, block_keys, filters }
}
#[inline(always)]
fn to_prefix(key: &[u8]) -> u64 {
let mut buf = [0u8; 8];
// We interpret the first 8 bytes of the FULL key (including TableID)
let len = key.len().min(8);
buf[..len].copy_from_slice(&key[..len]);
u64::from_be_bytes(buf)
}
fn may_exist(&self, user_key_range: &UserKeyRangeRef<'_>, h: u64) -> bool {
match user_key_range.0 {
Bound::Unbounded => false, // Simplified: probes in this benchmark always use Included bounds, so this arm is never hit
Bound::Included(left) | Bound::Excluded(left) => {
// --- FAST PATH ---
let target_prefix = Self::to_prefix(left);
// 1. Binary Search on u64 (No pointer chasing, no decoding)
let mut idx = self.block_prefixes.partition_point(|&p| p < target_prefix);
// 2. Tie-Breaker (Necessary for correctness)
if idx < self.block_prefixes.len() && self.block_prefixes[idx] == target_prefix {
// Only NOW do we touch the heap and decode
let ord = FullKey::decode(&self.block_keys[idx]).user_key.cmp(left);
if ord == Ordering::Less {
idx += 1;
}
}
let start_idx = idx.saturating_sub(1);
self.check_range(start_idx, user_key_range.1, h)
}
}
}
#[inline(always)]
fn check_range(&self, mut block_idx: usize, right_bound: Bound<&[u8]>, h: u64) -> bool {
while block_idx < self.filters.len() {
// Check if out of bounds (Optimized with prefix!)
if let Bound::Included(right) = right_bound {
let right_prefix = Self::to_prefix(right);
let curr_prefix = self.block_prefixes[block_idx];
if curr_prefix > right_prefix { return false; }
if curr_prefix == right_prefix {
if FullKey::decode(&self.block_keys[block_idx]).user_key.cmp(right) == Ordering::Greater {
return false;
}
}
}
if self.filters[block_idx].contains(h) { return true; }
block_idx += 1;
}
false
}
}
fn generate_hummock_data(block_count: usize) -> (Vec<(Vec<u8>, MockXor16)>, Vec<Vec<u8>>) {
let mut rng = StdRng::seed_from_u64(2024);
let mut blocks = Vec::with_capacity(block_count);
let mut probe_keys = Vec::with_capacity(1000);
// Generate sorted keys simulating [TableID (4B)][UserKey (8B)][Epoch (8B)]
// We use a small number of TableIDs to create prefix collisions, testing the fallback logic.
let mut current_key = 0u64;
for _ in 0..block_count {
// Increment key to ensure sorted order
current_key += rng.random_range(100..1000);
let mut key_bytes = Vec::new();
// Table ID (High bits of our counter)
key_bytes.extend_from_slice(&(current_key >> 32).to_be_bytes()[4..]);
// User Key Body
key_bytes.extend_from_slice(&current_key.to_be_bytes());
// Epoch (Random, usually decreasing in Hummock, but irrelevant for sort here)
key_bytes.extend_from_slice(&rng.random::<u64>().to_be_bytes());
// Save some keys for probing later
if rng.random_bool(0.01) {
probe_keys.push(key_bytes.clone()); // Hit case
} else if rng.random_bool(0.005) {
// Miss case (slightly tweaked key)
let mut miss = key_bytes.clone();
miss[10] ^= 0xFF;
probe_keys.push(miss);
}
blocks.push((key_bytes, MockXor16::new()));
}
(blocks, probe_keys)
}
fn criterion_benchmark(c: &mut Criterion) {
let num_blocks = 16384;
let (data, probe_full_keys) = generate_hummock_data(num_blocks);
// Prepare Implementations
let baseline = BlockBasedXor16Filter { filters: data.clone() };
let optimized = OptimizedBlockFilter::new(data);
let mut rng = StdRng::seed_from_u64(2024);
// Prepare Probe Set (UserKeyRangeRef)
// In Hummock, we search using the *User Key*, stripping the epoch.
let probes: Vec<(UserKeyRangeRef, u64)> = probe_full_keys.iter().map(|k| {
let decoded = FullKey::decode(k);
// Point lookup range: [Key, Key]
let range = (Bound::Included(decoded.user_key), Bound::Included(decoded.user_key));
let hash = rng.random(); // Dummy hash
(range, hash)
}).collect();
let mut group = c.benchmark_group("Hummock XorFilter Lookup");
group.throughput(Throughput::Elements(probes.len() as u64));
group.bench_function("Baseline (Pointer Chase + FullKey Decode)", |b| {
b.iter(|| {
for (range, h) in &probes {
black_box(baseline.may_exist(range, *h));
}
})
});
group.bench_function("Optimized (Prefix Inlining)", |b| {
b.iter(|| {
for (range, h) in &probes {
black_box(optimized.may_exist(range, *h));
}
})
});
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
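For reference, here is a rough standalone sketch of the Eytzinger-layout-plus-prefetching idea mentioned above. EytzingerPrefixes, from_sorted and lower_bound are illustrative names I made up, not existing hummock APIs, and the prefetch distance (16 slots, i.e. four levels ahead) is just an assumption to tune.

// Rough sketch, not part of the benchmark above: the prefix array rearranged into
// Eytzinger (BFS) order, with software prefetching on x86_64.
struct EytzingerPrefixes {
    // 1-indexed implicit tree: slot 0 is unused, children of node i are 2i and 2i + 1.
    tree: Vec<u64>,
}

impl EytzingerPrefixes {
    fn from_sorted(sorted: &[u64]) -> Self {
        let mut tree = vec![0u64; sorted.len() + 1];
        let mut src = 0;
        Self::fill(sorted, &mut tree, &mut src, 1);
        Self { tree }
    }

    // In-order traversal of the implicit tree consumes the sorted input left to right.
    fn fill(sorted: &[u64], tree: &mut [u64], src: &mut usize, dst: usize) {
        if dst < tree.len() {
            Self::fill(sorted, tree, src, 2 * dst);
            tree[dst] = sorted[*src];
            *src += 1;
            Self::fill(sorted, tree, src, 2 * dst + 1);
        }
    }

    /// Returns the Eytzinger index of the first prefix >= `target` (0 if every prefix is smaller).
    fn lower_bound(&self, target: u64) -> usize {
        let n = self.tree.len();
        let mut i = 1usize;
        while i < n {
            // Prefetch four levels ahead so the cache line is resident when we get there.
            #[cfg(target_arch = "x86_64")]
            unsafe {
                use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
                let ahead = (i * 16).min(n - 1);
                _mm_prefetch::<_MM_HINT_T0>(self.tree.as_ptr().add(ahead) as *const i8);
            }
            // Branchless descent: go right when the current prefix is smaller than the target.
            i = 2 * i + usize::from(self.tree[i] < target);
        }
        // Strip the trailing right turns (and one left turn) to recover the answer's index.
        i >> (i.trailing_ones() + 1)
    }
}

fn main() {
    let sorted: Vec<u64> = vec![10, 20, 30, 40, 50, 60, 70];
    let eytz = EytzingerPrefixes::from_sorted(&sorted);
    // 35 falls between 30 and 40; the returned slot holds the first prefix >= 35.
    let idx = eytz.lower_bound(35);
    assert_eq!(eytz.tree[idx], 40);
}

Note that the returned index is in Eytzinger order, so block_keys and filters would either have to be permuted into the same order or a small index-mapping table kept alongside; whether that extra complexity beats the plain sorted Vec<u64> is exactly what I'd want to measure in the PR.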