@@ -24,13 +24,16 @@ const TARGET_FP: f64 = 0.01;
2424/// k = -ln(p) / ln(2) ≈ 6.64 → 7
2525const NUM_HASHES : u32 = 7 ;
2626
/// Compute the bit count for `n` elements at `fp` false-positive rate,
/// rounded up to the next **power of two** so modulo can be replaced with
/// a bitmask (`& mask`).
///
/// m = -n * ln(p) / (ln2)^2, then → next_power_of_two
///
/// The result is always a power of two and never smaller than 64 bits.
/// Because of the rounding it may be up to 2x larger than the theoretical
/// optimum `m`.
///
/// Degenerate inputs never panic and never return 0:
/// * `fp >= 1.0`, `n == 0`, or a NaN intermediate collapse to the 64-bit floor.
/// * An `m` too large for `usize` (huge `n`, or `fp` at/near 0) saturates to
///   the largest power of two representable in `usize`.
fn optimal_bits(n: usize, fp: f64) -> usize {
    /// Largest power of two that fits in `usize`.
    const MAX_POW2: usize = (usize::MAX >> 1) + 1;

    let m = -(n as f64) * fp.ln() / (core::f64::consts::LN_2.powi(2));
    // Float → int `as` casts saturate (NaN and negatives become 0, +inf becomes
    // `usize::MAX`), so the only remaining hazard is the power-of-two rounding.
    let m = (m.ceil() as usize).max(64);
    // `next_power_of_two()` would overflow here (panic in debug, 0 in release);
    // a 0 would make `bits - 1` wrap and break the bitmask invariant.
    m.checked_next_power_of_two().unwrap_or(MAX_POW2)
}
3538
3639/// Tracks how the backing memory was allocated so `Drop` can free it correctly.
@@ -53,8 +56,8 @@ pub struct MmapBloom {
5356 ptr : * mut u8 ,
5457 /// Usable length in bytes (= num_bits / 8).
5558 len_bytes : usize ,
56- /// Total number of usable bits (= len_bytes * 8 ).
57- num_bits : u64 ,
59+ /// Bitmask for fast modulo: `num_bits - 1` (num_bits is always power of 2 ).
60+ mask : u64 ,
5861 /// Number of elements inserted (approximate — counts every insert call).
5962 count : usize ,
6063 /// How the memory was allocated, for correct deallocation.
@@ -83,7 +86,7 @@ impl MmapBloom {
8386 Self {
8487 ptr,
8588 len_bytes,
86- num_bits : bits as u64 ,
89+ mask : ( bits as u64 ) - 1 ,
8790 count : 0 ,
8891 alloc_kind,
8992 }
@@ -167,69 +170,77 @@ impl MmapBloom {
167170 ( ptr, AllocKind :: Heap )
168171 }
169172
170- /// Compute the k bit positions for a given item using double hashing.
171- /// h_i = h1 + i * h2 (mod num_bits)
172- #[ inline( always) ]
173- fn bit_positions < T : Hash > ( & self , item : & T ) -> [ u64 ; NUM_HASHES as usize ] {
174- let mut h1_state = ahash:: AHasher :: default ( ) ;
175- item. hash ( & mut h1_state) ;
176- let h1 = h1_state. finish ( ) ;
177-
178- // Second hash: fold and mix.
179- let h2 = h1
180- . wrapping_mul ( 0x517cc1b727220a95 )
181- . wrapping_add ( 0x6c62272e07bb0142 ) ;
182-
183- let mut positions = [ 0u64 ; NUM_HASHES as usize ] ;
184- for i in 0 ..NUM_HASHES as u64 {
185- positions[ i as usize ] = ( h1. wrapping_add ( i. wrapping_mul ( h2) ) ) % self . num_bits ;
186- }
187- positions
188- }
189-
190- /// Set bit at position `pos`.
191- #[ inline( always) ]
192- fn set_bit ( & mut self , pos : u64 ) {
193- let byte_idx = ( pos / 8 ) as usize ;
194- let bit_idx = ( pos % 8 ) as u8 ;
195- debug_assert ! ( byte_idx < self . len_bytes) ;
196- // SAFETY: `pos < num_bits` (enforced by modulo in `bit_positions`),
197- // and `num_bits == len_bytes * 8`, so `byte_idx < len_bytes` always holds.
198- unsafe {
199- let byte = & mut * self . ptr . add ( byte_idx) ;
200- * byte |= 1 << bit_idx;
201- }
202- }
203-
204- /// Test bit at position `pos`.
173+ /// Compute double-hash seeds for an item.
174+ ///
175+ /// h2 is derived via a MurmurHash3-style finalizer to decorrelate it from
176+ /// h1 — critical for low false-positive rates with power-of-2 masking.
177+ /// The `| 1` forces h2 odd (coprime with any power of 2) so all bit
178+ /// positions are reachable.
205179 #[ inline( always) ]
206- fn test_bit ( & self , pos : u64 ) -> bool {
207- let byte_idx = ( pos / 8 ) as usize ;
208- let bit_idx = ( pos % 8 ) as u8 ;
209- debug_assert ! ( byte_idx < self . len_bytes) ;
210- // SAFETY: same invariant as `set_bit`.
211- unsafe {
212- let byte = * self . ptr . add ( byte_idx) ;
213- byte & ( 1 << bit_idx) != 0
214- }
180+ fn hash_seeds < T : Hash + ?Sized > ( item : & T ) -> ( u64 , u64 ) {
181+ let mut state = ahash:: AHasher :: default ( ) ;
182+ item. hash ( & mut state) ;
183+ let h1 = state. finish ( ) ;
184+ // MurmurHash3 64-bit finalizer — avalanches all bits.
185+ let mut x = h1;
186+ x ^= x >> 33 ;
187+ x = x. wrapping_mul ( 0xff51afd7ed558ccd ) ;
188+ x ^= x >> 33 ;
189+ x = x. wrapping_mul ( 0xc4ceb9fe1a85ec53 ) ;
190+ x ^= x >> 33 ;
191+ ( h1, x | 1 )
215192 }
216193
217194 /// Insert an item into the bloom filter.
195+ ///
196+ /// Computes each bit position inline — no intermediate array.
197+ /// Uses enhanced double hashing: h_i = h1 + i*h2 + i*(i-1)/2 to
198+ /// eliminate correlation artefacts with power-of-2 sizing.
218199 #[ inline]
219- pub fn insert < T : Hash > ( & mut self , item : & T ) {
220- let positions = self . bit_positions ( item) ;
221- for & pos in & positions {
222- self . set_bit ( pos) ;
200+ pub fn insert < T : Hash + ?Sized > ( & mut self , item : & T ) {
201+ let ( h1, h2) = Self :: hash_seeds ( item) ;
202+ let mask = self . mask ;
203+ let mut composite = h1;
204+ for i in 0 ..NUM_HASHES as u64 {
205+ let pos = composite & mask;
206+ let byte_idx = ( pos >> 3 ) as usize ;
207+ let bit_idx = ( pos & 7 ) as u8 ;
208+ // SAFETY: pos < num_bits (mask guarantees), num_bits == len_bytes * 8.
209+ unsafe {
210+ let byte = & mut * self . ptr . add ( byte_idx) ;
211+ * byte |= 1 << bit_idx;
212+ }
213+ // Enhanced double hashing: next = h1 + (i+1)*h2 + (i+1)*i/2
214+ // = composite + h2 + i
215+ composite = composite. wrapping_add ( h2) . wrapping_add ( i) ;
223216 }
224217 self . count += 1 ;
225218 }
226219
227220 /// Check if an item is probably in the set.
228- /// Returns `false` only when the item is *definitely* absent.
221+ ///
222+ /// Returns `false` as soon as *any* bit is unset — on the common "absent"
223+ /// path this exits after testing only 1-2 bits instead of all 7.
229224 #[ inline]
230- pub fn contains < T : Hash > ( & self , item : & T ) -> bool {
231- let positions = self . bit_positions ( item) ;
232- positions. iter ( ) . all ( |& pos| self . test_bit ( pos) )
225+ pub fn contains < T : Hash + ?Sized > ( & self , item : & T ) -> bool {
226+ let ( h1, h2) = Self :: hash_seeds ( item) ;
227+ let mask = self . mask ;
228+ let mut composite = h1;
229+ for i in 0 ..NUM_HASHES as u64 {
230+ let pos = composite & mask;
231+ let byte_idx = ( pos >> 3 ) as usize ;
232+ let bit_idx = ( pos & 7 ) as u8 ;
233+ // SAFETY: same invariant as `insert`.
234+ let set = unsafe {
235+ let byte = * self . ptr . add ( byte_idx) ;
236+ byte & ( 1 << bit_idx) != 0
237+ } ;
238+ if !set {
239+ return false ;
240+ }
241+ composite = composite. wrapping_add ( h2) . wrapping_add ( i) ;
242+ }
243+ true
233244 }
234245
235246 /// Approximate number of insertions performed.
@@ -295,7 +306,7 @@ impl Drop for MmapBloom {
295306impl std:: fmt:: Debug for MmapBloom {
296307 fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
297308 f. debug_struct ( "MmapBloom" )
298- . field ( "num_bits" , & self . num_bits )
309+ . field ( "num_bits" , & ( self . mask + 1 ) )
299310 . field ( "count" , & self . count )
300311 . field ( "size_bytes" , & self . len_bytes )
301312 . field ( "alloc_kind" , & self . alloc_kind )
@@ -314,7 +325,7 @@ impl Clone for MmapBloom {
314325 Self {
315326 ptr,
316327 len_bytes : self . len_bytes ,
317- num_bits : self . num_bits ,
328+ mask : self . mask ,
318329 count : self . count ,
319330 alloc_kind,
320331 }
@@ -427,9 +438,9 @@ mod tests {
/// A 1M-element filter should need more than 1 MB but no more than 2 MiB.
#[test]
fn test_size_reasonable() {
    let bloom = MmapBloom::new(1_000_000);
    let size = bloom.size_bytes();
    // For 1M items at 1% FP: ~1.2 MB optimal, rounded to next power of 2 → 2 MiB.
    assert!(size > 1_000_000, "filter unexpectedly small: {} bytes", size);
    // 2 MiB == 2^24 bits.
    assert!(size <= 2_097_152, "filter unexpectedly large: {} bytes", size);
}
434445
435446 #[ test]
@@ -442,9 +453,9 @@ mod tests {
/// `optimal_bits` rounds the theoretical size up to a power of two and
/// never goes below the 64-bit floor.
#[test]
fn test_optimal_bits() {
    let bits = optimal_bits(1_000_000, 0.01);
    // ~9.58M optimal → next power of 2 = 16_777_216 (2^24)
    assert!(bits.is_power_of_two());
    assert_eq!(bits, 16_777_216);

    // Tiny or empty workloads are clamped to the 64-bit minimum.
    assert_eq!(optimal_bits(0, 0.01), 64);
    assert_eq!(optimal_bits(1, 0.01), 64);
}
449460
450461 #[ test]
0 commit comments