Skip to content

Commit 3ab5bfe

Browse files
committed
refactor: change *Hashed API to accept []byte digest directly
SplitDigest is now an internal detail (unexported as splitDigest). AddHashed, HasHashed, and AddIfNotHasHashed, along with their thread-safe *TS variants, now accept the digest []byte and extract h1/h2 internally, mirroring the Add(entry []byte) signature.
1 parent be27611 commit 3ab5bfe

File tree

5 files changed

+69
-148
lines changed

5 files changed

+69
-148
lines changed

README.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,9 @@ bf.HasTS([]byte("peanutbutter")) // true
3939

4040
// pre-hashed path: skip SipHash when input is a crypto digest
4141
// (Kirsch-Mitzenmacher optimization, ~4x faster per op)
42-
digest := sha256Hash(...) // any crypto digest >= 16 bytes
43-
h1, h2 := bbloom.SplitDigest(digest)
44-
bf.AddHashed(h1, h2)
45-
bf.HasHashed(h1, h2) // true
42+
digest := sha256Hash(...) // any crypto digest >= 16 bytes
43+
bf.AddHashed(digest)
44+
bf.HasHashed(digest) // true
4645

4746
// JSON serialization
4847
data := bf.JSONMarshal()

bbloom.go

Lines changed: 26 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -290,39 +290,31 @@ func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) {
290290
return added
291291
}
292292

293-
// SplitDigest reads two little-endian uint64 values from a cryptographic
294-
// digest for use with [Bloom.AddHashed] / [Bloom.HasHashed]. h1 comes from
295-
// digest[0:8], h2 from digest[8:16]. Panics if len(digest) < 16.
296-
//
297-
// Typical usage with a go-cid CID and go-multihash:
298-
//
299-
// dm, _ := multihash.Decode(c.Hash())
300-
// h1, h2 := bbloom.SplitDigest(dm.Digest)
301-
// bf.AddHashed(h1, h2)
302-
//
303-
// Digests from different hash functions (SHA2-256, BLAKE2b-256, etc.) can
304-
// be mixed freely in the same filter -- each produces uniform, independent
305-
// bytes. Identity multihashes (code 0x00) and digests shorter than 16
306-
// bytes cannot use this path; use a separate [Bloom] with [Bloom.Add]
307-
// for those.
308-
func SplitDigest(digest []byte) (h1, h2 uint64) {
293+
// splitDigest reads two little-endian uint64 values from a cryptographic
294+
// digest. h1 comes from digest[0:8], h2 from digest[8:16].
295+
func splitDigest(digest []byte) (h1, h2 uint64) {
309296
_ = digest[15] // bounds check hint
310297
h1 = binary.LittleEndian.Uint64(digest[:8])
311298
h2 = binary.LittleEndian.Uint64(digest[8:16])
312299
return h1, h2
313300
}
314301

315-
// AddHashed inserts an entry using pre-computed hash values, skipping
316-
// SipHash entirely. h1 and h2 are typically extracted from a cryptographic
317-
// digest with [SplitDigest]. h2 is forced odd internally for coprimality
318-
// with the power-of-two filter size.
302+
// AddHashed inserts an entry by reading two hash values directly from a
303+
// cryptographic digest, skipping SipHash entirely. The digest must be at
304+
// least 16 bytes (true for SHA2-256, BLAKE2b-256, BLAKE3, etc.). Panics
305+
// if len(digest) < 16.
306+
//
307+
// Digests from different hash functions can be mixed freely in the same
308+
// filter. Identity multihashes (code 0x00) and digests shorter than 16
309+
// bytes cannot use this path; use a separate [Bloom] with [Bloom.Add].
319310
//
320311
// Do not mix AddHashed and [Bloom.Add] on the same filter; they compute
321312
// different bit positions for the same logical key.
322313
//
323314
// Not safe for concurrent use; see [Bloom.AddHashedTS].
324-
func (bl *Bloom) AddHashed(h1, h2 uint64) {
315+
func (bl *Bloom) AddHashed(digest []byte) {
325316
bl.checkMode(modeHashed)
317+
h1, h2 := splitDigest(digest)
326318
bl.content++
327319
h2 |= 1
328320
for i := uint64(0); i < bl.setLocs; i++ {
@@ -331,18 +323,18 @@ func (bl *Bloom) AddHashed(h1, h2 uint64) {
331323
}
332324

333325
// AddHashedTS is the thread-safe version of [Bloom.AddHashed].
334-
func (bl *Bloom) AddHashedTS(h1, h2 uint64) {
326+
func (bl *Bloom) AddHashedTS(digest []byte) {
335327
bl.Mtx.Lock()
336-
bl.AddHashed(h1, h2)
328+
bl.AddHashed(digest)
337329
bl.Mtx.Unlock()
338330
}
339331

340-
// HasHashed reports whether an entry with the given pre-computed hash
341-
// values is in the filter. Only use on a filter populated with
342-
// [Bloom.AddHashed], not [Bloom.Add]. Not safe for concurrent use;
343-
// see [Bloom.HasHashedTS].
344-
func (bl *Bloom) HasHashed(h1, h2 uint64) bool {
332+
// HasHashed reports whether an entry with the given cryptographic digest
333+
// is in the filter. Only use on a filter populated with [Bloom.AddHashed],
334+
// not [Bloom.Add]. Not safe for concurrent use; see [Bloom.HasHashedTS].
335+
func (bl *Bloom) HasHashed(digest []byte) bool {
345336
bl.checkMode(modeHashed)
337+
h1, h2 := splitDigest(digest)
346338
h2 |= 1
347339
for i := uint64(0); i < bl.setLocs; i++ {
348340
if !bl.isSet((h1 + i*h2) & bl.size) {
@@ -353,18 +345,19 @@ func (bl *Bloom) HasHashed(h1, h2 uint64) bool {
353345
}
354346

355347
// HasHashedTS is the thread-safe version of [Bloom.HasHashed].
356-
func (bl *Bloom) HasHashedTS(h1, h2 uint64) bool {
348+
func (bl *Bloom) HasHashedTS(digest []byte) bool {
357349
bl.Mtx.RLock()
358-
has := bl.HasHashed(h1, h2)
350+
has := bl.HasHashed(digest)
359351
bl.Mtx.RUnlock()
360352
return has
361353
}
362354

363355
// AddIfNotHasHashed is the pre-hashed variant of [Bloom.AddIfNotHas].
364356
// Only use on a filter populated with [Bloom.AddHashed], not [Bloom.Add].
365357
// Not safe for concurrent use; see [Bloom.AddIfNotHasHashedTS].
366-
func (bl *Bloom) AddIfNotHasHashed(h1, h2 uint64) (added bool) {
358+
func (bl *Bloom) AddIfNotHasHashed(digest []byte) (added bool) {
367359
bl.checkMode(modeHashed)
360+
h1, h2 := splitDigest(digest)
368361
h2 |= 1
369362
contained := true
370363
for i := uint64(0); i < bl.setLocs; i++ {
@@ -378,9 +371,9 @@ func (bl *Bloom) AddIfNotHasHashed(h1, h2 uint64) (added bool) {
378371
}
379372

380373
// AddIfNotHasHashedTS is the thread-safe version of [Bloom.AddIfNotHasHashed].
381-
func (bl *Bloom) AddIfNotHasHashedTS(h1, h2 uint64) (added bool) {
374+
func (bl *Bloom) AddIfNotHasHashedTS(digest []byte) (added bool) {
382375
bl.Mtx.Lock()
383-
added = bl.AddIfNotHasHashed(h1, h2)
376+
added = bl.AddIfNotHasHashed(digest)
384377
bl.Mtx.Unlock()
385378
return added
386379
}

bench_hashed_test.go

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ func makeSHA256Multihashes(n int) (mhs [][]byte, digests [][]byte) {
3030
var (
3131
benchMHs [][]byte
3232
benchDigests [][]byte
33-
benchH1s []uint64
34-
benchH2s []uint64
3533
)
3634

3735
const benchN = 2_000_000
@@ -41,11 +39,6 @@ func initBenchData() {
4139
return
4240
}
4341
benchMHs, benchDigests = makeSHA256Multihashes(benchN)
44-
benchH1s = make([]uint64, benchN)
45-
benchH2s = make([]uint64, benchN)
46-
for i, d := range benchDigests {
47-
benchH1s[i], benchH2s[i] = SplitDigest(d)
48-
}
4942
}
5043

5144
func newBenchBloom() *Bloom {
@@ -71,10 +64,10 @@ func BenchmarkPerOp_Add_SipHash(b *testing.B) {
7164
func BenchmarkPerOp_Add_Hashed(b *testing.B) {
7265
initBenchData()
7366
bf := newBenchBloom()
74-
h1, h2 := benchH1s[0], benchH2s[0]
67+
d := benchDigests[0]
7568
b.ResetTimer()
7669
for range b.N {
77-
bf.AddHashed(h1, h2)
70+
bf.AddHashed(d)
7871
}
7972
}
8073

@@ -92,11 +85,11 @@ func BenchmarkPerOp_Has_SipHash(b *testing.B) {
9285
func BenchmarkPerOp_Has_Hashed(b *testing.B) {
9386
initBenchData()
9487
bf := newBenchBloom()
95-
h1, h2 := benchH1s[0], benchH2s[0]
96-
bf.AddHashed(h1, h2)
88+
d := benchDigests[0]
89+
bf.AddHashed(d)
9790
b.ResetTimer()
9891
for range b.N {
99-
bf.HasHashed(h1, h2)
92+
bf.HasHashed(d)
10093
}
10194
}
10295

@@ -117,8 +110,7 @@ func BenchmarkBulk2M_Add_Hashed(b *testing.B) {
117110
for range b.N {
118111
bf := newBenchBloom()
119112
for _, d := range benchDigests {
120-
h1, h2 := SplitDigest(d)
121-
bf.AddHashed(h1, h2)
113+
bf.AddHashed(d)
122114
}
123115
}
124116
}
@@ -141,14 +133,12 @@ func BenchmarkBulk2M_Has_Hashed(b *testing.B) {
141133
initBenchData()
142134
bf := newBenchBloom()
143135
for _, d := range benchDigests {
144-
h1, h2 := SplitDigest(d)
145-
bf.AddHashed(h1, h2)
136+
bf.AddHashed(d)
146137
}
147138
b.ResetTimer()
148139
for range b.N {
149140
for _, d := range benchDigests {
150-
h1, h2 := SplitDigest(d)
151-
bf.HasHashed(h1, h2)
141+
bf.HasHashed(d)
152142
}
153143
}
154144
}
@@ -168,8 +158,7 @@ func BenchmarkBulk2M_AddIfNotHas_Hashed(b *testing.B) {
168158
for range b.N {
169159
bf := newBenchBloom()
170160
for _, d := range benchDigests {
171-
h1, h2 := SplitDigest(d)
172-
bf.AddIfNotHasHashed(h1, h2)
161+
bf.AddIfNotHasHashed(d)
173162
}
174163
}
175164
}

doc.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
// # Pre-hashed keys (Kirsch-Mitzenmacher optimization)
2020
//
2121
// When the input is already a cryptographic hash digest (e.g. SHA2-256),
22-
// the SipHash step can be skipped entirely. [SplitDigest] extracts two
23-
// uint64 values from the digest, and [Bloom.AddHashed] / [Bloom.HasHashed]
24-
// use them directly as the double-hashing pair. This is based on the result
22+
// the SipHash step can be skipped entirely. [Bloom.AddHashed] and
23+
// [Bloom.HasHashed] read two uint64 values directly from the digest bytes
24+
// and use them as the double-hashing pair. This is based on the result
2525
// by Kirsch and Mitzenmacher ("Less Hashing, Same Performance: Building a
2626
// Better Bloom Filter", 2008, https://doi.org/10.1002/rsa.20208) which
2727
// proves that only two hash values are needed to simulate k independent

0 commit comments

Comments
 (0)