Skip to content

Commit d2077ff

Browse files
j-mendez and claude committed
perf(spider): bloom filter bitmask addressing + inline early-exit, interner zero-clone extend, bump v2.47.53
Bloom: power-of-2 sizing with bitmask addressing (replaces an expensive modulo performed 7x per operation), inline early-exit `contains` (1-2 bit tests on the absent path instead of all 7), and a MurmurHash3 finalizer to decorrelate h2 from h1.

Interner: move instead of clone in `extend_links`, read-only `get()` instead of `get_or_intern()` for visited checks, and removal of the double-reference indirection in bloom calls.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 58f1937 commit d2077ff

10 files changed

Lines changed: 128 additions & 106 deletions

File tree

Cargo.lock

Lines changed: 15 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

spider/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider"
3-
version = "2.47.52"
3+
version = "2.47.53"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A web crawler and scraper, building blocks for data curation workloads."
66
repository = "https://github.com/spider-rs/spider"
@@ -121,11 +121,11 @@ features = ["serde", "headers", "dynamic-versions"]
121121

122122
[dependencies.spider_agent_types]
123123
path = "../spider_agent_types"
124-
version = "2.47.52"
124+
version = "2.47.53"
125125

126126
[dependencies.spider_agent]
127127
path = "../spider_agent"
128-
version = "2.47.52"
128+
version = "2.47.53"
129129
optional = true
130130
default-features = false
131131

spider/src/utils/bloom.rs

Lines changed: 78 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -24,13 +24,16 @@ const TARGET_FP: f64 = 0.01;
2424
/// k = -ln(p) / ln(2) ≈ 6.64 → 7
2525
const NUM_HASHES: u32 = 7;
2626

27-
/// Compute optimal bit count for `n` elements at `fp` false-positive rate.
28-
/// m = -n * ln(p) / (ln2)^2
27+
/// Compute optimal bit count for `n` elements at `fp` false-positive rate,
28+
/// rounded up to the next **power of two** so modulo can be replaced with
29+
/// a bitmask (`& mask`).
30+
///
31+
/// m = max(64, ceil(-n * ln(p) / (ln 2)^2)), then rounded up to the next power of two.
2932
fn optimal_bits(n: usize, fp: f64) -> usize {
3033
let m = -(n as f64) * fp.ln() / (core::f64::consts::LN_2.powi(2));
31-
// Round up to next multiple of 8 so we address whole bytes.
32-
let m = m.ceil() as usize;
33-
(m + 7) & !7
34+
let m = (m.ceil() as usize).max(64);
35+
// Round to next power of two for bitmask addressing.
36+
m.next_power_of_two()
3437
}
3538

3639
/// Tracks how the backing memory was allocated so `Drop` can free it correctly.
@@ -53,8 +56,8 @@ pub struct MmapBloom {
5356
ptr: *mut u8,
5457
/// Usable length in bytes (= num_bits / 8).
5558
len_bytes: usize,
56-
/// Total number of usable bits (= len_bytes * 8).
57-
num_bits: u64,
59+
/// Bitmask for fast modulo: `num_bits - 1` (`num_bits` is always a power of 2).
60+
mask: u64,
5861
/// Number of elements inserted (approximate — counts every insert call).
5962
count: usize,
6063
/// How the memory was allocated, for correct deallocation.
@@ -83,7 +86,7 @@ impl MmapBloom {
8386
Self {
8487
ptr,
8588
len_bytes,
86-
num_bits: bits as u64,
89+
mask: (bits as u64) - 1,
8790
count: 0,
8891
alloc_kind,
8992
}
@@ -167,69 +170,77 @@ impl MmapBloom {
167170
(ptr, AllocKind::Heap)
168171
}
169172

170-
/// Compute the k bit positions for a given item using double hashing.
171-
/// h_i = h1 + i * h2 (mod num_bits)
172-
#[inline(always)]
173-
fn bit_positions<T: Hash>(&self, item: &T) -> [u64; NUM_HASHES as usize] {
174-
let mut h1_state = ahash::AHasher::default();
175-
item.hash(&mut h1_state);
176-
let h1 = h1_state.finish();
177-
178-
// Second hash: fold and mix.
179-
let h2 = h1
180-
.wrapping_mul(0x517cc1b727220a95)
181-
.wrapping_add(0x6c62272e07bb0142);
182-
183-
let mut positions = [0u64; NUM_HASHES as usize];
184-
for i in 0..NUM_HASHES as u64 {
185-
positions[i as usize] = (h1.wrapping_add(i.wrapping_mul(h2))) % self.num_bits;
186-
}
187-
positions
188-
}
189-
190-
/// Set bit at position `pos`.
191-
#[inline(always)]
192-
fn set_bit(&mut self, pos: u64) {
193-
let byte_idx = (pos / 8) as usize;
194-
let bit_idx = (pos % 8) as u8;
195-
debug_assert!(byte_idx < self.len_bytes);
196-
// SAFETY: `pos < num_bits` (enforced by modulo in `bit_positions`),
197-
// and `num_bits == len_bytes * 8`, so `byte_idx < len_bytes` always holds.
198-
unsafe {
199-
let byte = &mut *self.ptr.add(byte_idx);
200-
*byte |= 1 << bit_idx;
201-
}
202-
}
203-
204-
/// Test bit at position `pos`.
173+
/// Compute double-hash seeds for an item.
174+
///
175+
/// h2 is derived via a MurmurHash3-style finalizer to decorrelate it from
176+
/// h1 — critical for low false-positive rates with power-of-2 masking.
177+
/// The `| 1` forces h2 odd (coprime with any power of 2) so all bit
178+
/// positions are reachable.
205179
#[inline(always)]
206-
fn test_bit(&self, pos: u64) -> bool {
207-
let byte_idx = (pos / 8) as usize;
208-
let bit_idx = (pos % 8) as u8;
209-
debug_assert!(byte_idx < self.len_bytes);
210-
// SAFETY: same invariant as `set_bit`.
211-
unsafe {
212-
let byte = *self.ptr.add(byte_idx);
213-
byte & (1 << bit_idx) != 0
214-
}
180+
fn hash_seeds<T: Hash + ?Sized>(item: &T) -> (u64, u64) {
181+
let mut state = ahash::AHasher::default();
182+
item.hash(&mut state);
183+
let h1 = state.finish();
184+
// MurmurHash3 64-bit finalizer — avalanches all bits.
185+
let mut x = h1;
186+
x ^= x >> 33;
187+
x = x.wrapping_mul(0xff51afd7ed558ccd);
188+
x ^= x >> 33;
189+
x = x.wrapping_mul(0xc4ceb9fe1a85ec53);
190+
x ^= x >> 33;
191+
(h1, x | 1)
215192
}
216193

217194
/// Insert an item into the bloom filter.
195+
///
196+
/// Computes each bit position inline — no intermediate array.
197+
/// Uses enhanced double hashing: h_i = h1 + i*h2 + i*(i-1)/2 to
198+
/// eliminate correlation artefacts with power-of-2 sizing.
218199
#[inline]
219-
pub fn insert<T: Hash>(&mut self, item: &T) {
220-
let positions = self.bit_positions(item);
221-
for &pos in &positions {
222-
self.set_bit(pos);
200+
pub fn insert<T: Hash + ?Sized>(&mut self, item: &T) {
201+
let (h1, h2) = Self::hash_seeds(item);
202+
let mask = self.mask;
203+
let mut composite = h1;
204+
for i in 0..NUM_HASHES as u64 {
205+
let pos = composite & mask;
206+
let byte_idx = (pos >> 3) as usize;
207+
let bit_idx = (pos & 7) as u8;
208+
// SAFETY: `pos <= mask` (result of `& mask`), and `mask + 1` is the bit count, which equals `len_bytes * 8`, so `byte_idx < len_bytes`.
209+
unsafe {
210+
let byte = &mut *self.ptr.add(byte_idx);
211+
*byte |= 1 << bit_idx;
212+
}
213+
// Enhanced double hashing: next = h1 + (i+1)*h2 + (i+1)*i/2
214+
// = composite + h2 + i
215+
composite = composite.wrapping_add(h2).wrapping_add(i);
223216
}
224217
self.count += 1;
225218
}
226219

227220
/// Check if an item is probably in the set.
228-
/// Returns `false` only when the item is *definitely* absent.
221+
///
222+
/// Returns `false` as soon as *any* bit is unset — on the common "absent"
223+
/// path this exits after testing only 1-2 bits instead of all 7.
229224
#[inline]
230-
pub fn contains<T: Hash>(&self, item: &T) -> bool {
231-
let positions = self.bit_positions(item);
232-
positions.iter().all(|&pos| self.test_bit(pos))
225+
pub fn contains<T: Hash + ?Sized>(&self, item: &T) -> bool {
226+
let (h1, h2) = Self::hash_seeds(item);
227+
let mask = self.mask;
228+
let mut composite = h1;
229+
for i in 0..NUM_HASHES as u64 {
230+
let pos = composite & mask;
231+
let byte_idx = (pos >> 3) as usize;
232+
let bit_idx = (pos & 7) as u8;
233+
// SAFETY: same invariant as `insert`.
234+
let set = unsafe {
235+
let byte = *self.ptr.add(byte_idx);
236+
byte & (1 << bit_idx) != 0
237+
};
238+
if !set {
239+
return false;
240+
}
241+
composite = composite.wrapping_add(h2).wrapping_add(i);
242+
}
243+
true
233244
}
234245

235246
/// Approximate number of insertions performed.
@@ -295,7 +306,7 @@ impl Drop for MmapBloom {
295306
impl std::fmt::Debug for MmapBloom {
296307
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
297308
f.debug_struct("MmapBloom")
298-
.field("num_bits", &self.num_bits)
309+
.field("num_bits", &(self.mask + 1))
299310
.field("count", &self.count)
300311
.field("size_bytes", &self.len_bytes)
301312
.field("alloc_kind", &self.alloc_kind)
@@ -314,7 +325,7 @@ impl Clone for MmapBloom {
314325
Self {
315326
ptr,
316327
len_bytes: self.len_bytes,
317-
num_bits: self.num_bits,
328+
mask: self.mask,
318329
count: self.count,
319330
alloc_kind,
320331
}
@@ -427,9 +438,9 @@ mod tests {
427438
#[test]
428439
fn test_size_reasonable() {
429440
let bloom = MmapBloom::new(1_000_000);
430-
// For 1M items at 1% FP: ~1.2 MB
441+
// For 1M items at 1% FP: ~1.2 MB optimal, rounded to next power of 2 → 2 MB.
431442
assert!(bloom.size_bytes() > 1_000_000);
432-
assert!(bloom.size_bytes() < 2_000_000);
443+
assert!(bloom.size_bytes() <= 2_097_152); // 2 MiB (16 Mbit)
433444
}
434445

435446
#[test]
@@ -442,9 +453,9 @@ mod tests {
442453
#[test]
443454
fn test_optimal_bits() {
444455
let bits = optimal_bits(1_000_000, 0.01);
445-
// Should be ~9.58M bits ≈ 1.2 MB
446-
assert!(bits > 9_000_000);
447-
assert!(bits < 10_000_000);
456+
// ~9.58M optimal → next power of 2 = 16_777_216 (2^24)
457+
assert!(bits.is_power_of_two());
458+
assert_eq!(bits, 16_777_216);
448459
}
449460

450461
#[test]

0 commit comments

Comments
 (0)