refactor: change *Hashed API to accept []byte digest directly

lidel · lidel · commit 3ab5bfe8e9ba · 2026-03-17T19:01:21.000+01:00
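SplitDigest is now an internal detail (unexported as splitDigest).
AddHashed, HasHashed, and AddIfNotHasHashed, together with their
thread-safe *TS variants, now accept the digest []byte and extract
h1/h2 internally, mirroring the Add(entry []byte) signature. All of
them panic if len(digest) < 16. This is a breaking API change: callers
that passed (h1, h2) must pass the digest instead.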
diff --git a/README.md b/README.md
@@ -39,10 +39,9 @@ bf.HasTS([]byte("peanutbutter"))  // true
 
 // pre-hashed path: skip SipHash when input is a crypto digest
 // (Kirsch-Mitzenmacher optimization, ~4x faster per op)
-digest := sha256Hash(...)         // any crypto digest >= 16 bytes
-h1, h2 := bbloom.SplitDigest(digest)
-bf.AddHashed(h1, h2)
-bf.HasHashed(h1, h2)             // true
+digest := sha256Hash(...)      // any crypto digest >= 16 bytes
+bf.AddHashed(digest)
+bf.HasHashed(digest)           // true
 
 // JSON serialization
 data := bf.JSONMarshal()
diff --git a/bbloom.go b/bbloom.go
@@ -290,39 +290,31 @@ func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) {
 	return added
 }
 
-// SplitDigest reads two little-endian uint64 values from a cryptographic
-// digest for use with [Bloom.AddHashed] / [Bloom.HasHashed]. h1 comes from
-// digest[0:8], h2 from digest[8:16]. Panics if len(digest) < 16.
-//
-// Typical usage with a go-cid CID and go-multihash:
-//
-//	dm, _ := multihash.Decode(c.Hash())
-//	h1, h2 := bbloom.SplitDigest(dm.Digest)
-//	bf.AddHashed(h1, h2)
-//
-// Digests from different hash functions (SHA2-256, BLAKE2b-256, etc.) can
-// be mixed freely in the same filter -- each produces uniform, independent
-// bytes. Identity multihashes (code 0x00) and digests shorter than 16
-// bytes cannot use this path; use a separate [Bloom] with [Bloom.Add]
-// for those.
-func SplitDigest(digest []byte) (h1, h2 uint64) {
+// splitDigest reads two little-endian uint64 values from a cryptographic
+// digest: h1 from digest[0:8], h2 from digest[8:16]. Panics if len(digest) < 16.
+func splitDigest(digest []byte) (h1, h2 uint64) {
 	_ = digest[15] // bounds check hint
 	h1 = binary.LittleEndian.Uint64(digest[:8])
 	h2 = binary.LittleEndian.Uint64(digest[8:16])
 	return h1, h2
 }
 
-// AddHashed inserts an entry using pre-computed hash values, skipping
-// SipHash entirely. h1 and h2 are typically extracted from a cryptographic
-// digest with [SplitDigest]. h2 is forced odd internally for coprimality
-// with the power-of-two filter size.
+// AddHashed inserts an entry by reading two hash values directly from a
+// cryptographic digest, skipping SipHash entirely. The digest must be at
+// least 16 bytes (true for SHA2-256, BLAKE2b-256, BLAKE3, etc.). Panics
+// if len(digest) < 16.
+//
+// Digests from different hash functions can be mixed freely in the same
+// filter. Identity multihashes (code 0x00) and digests shorter than 16
+// bytes cannot use this path; use a separate [Bloom] with [Bloom.Add].
 //
 // Do not mix AddHashed and [Bloom.Add] on the same filter; they compute
 // different bit positions for the same logical key.
 //
 // Not safe for concurrent use; see [Bloom.AddHashedTS].
-func (bl *Bloom) AddHashed(h1, h2 uint64) {
+func (bl *Bloom) AddHashed(digest []byte) {
 	bl.checkMode(modeHashed)
+	h1, h2 := splitDigest(digest)
 	bl.content++
 	h2 |= 1
 	for i := uint64(0); i < bl.setLocs; i++ {
@@ -331,18 +323,18 @@ func (bl *Bloom) AddHashed(h1, h2 uint64) {
 }
 
 // AddHashedTS is the thread-safe version of [Bloom.AddHashed].
-func (bl *Bloom) AddHashedTS(h1, h2 uint64) {
+func (bl *Bloom) AddHashedTS(digest []byte) {
 	bl.Mtx.Lock()
-	bl.AddHashed(h1, h2)
+	bl.AddHashed(digest)
 	bl.Mtx.Unlock()
 }
 
-// HasHashed reports whether an entry with the given pre-computed hash
-// values is in the filter. Only use on a filter populated with
-// [Bloom.AddHashed], not [Bloom.Add]. Not safe for concurrent use;
-// see [Bloom.HasHashedTS].
-func (bl *Bloom) HasHashed(h1, h2 uint64) bool {
+// HasHashed reports whether the given cryptographic digest is in the
+// filter. Panics if len(digest) < 16. Only use on a filter populated with
+// [Bloom.AddHashed], not [Bloom.Add]. Not concurrency-safe; see [Bloom.HasHashedTS].
+func (bl *Bloom) HasHashed(digest []byte) bool {
 	bl.checkMode(modeHashed)
+	h1, h2 := splitDigest(digest)
 	h2 |= 1
 	for i := uint64(0); i < bl.setLocs; i++ {
 		if !bl.isSet((h1 + i*h2) & bl.size) {
@@ -353,18 +345,19 @@ func (bl *Bloom) HasHashed(h1, h2 uint64) bool {
 }
 
 // HasHashedTS is the thread-safe version of [Bloom.HasHashed].
-func (bl *Bloom) HasHashedTS(h1, h2 uint64) bool {
+func (bl *Bloom) HasHashedTS(digest []byte) bool {
 	bl.Mtx.RLock()
-	has := bl.HasHashed(h1, h2)
+	has := bl.HasHashed(digest)
 	bl.Mtx.RUnlock()
 	return has
 }
 
 // AddIfNotHasHashed is the pre-hashed variant of [Bloom.AddIfNotHas].
 // Only use on a filter populated with [Bloom.AddHashed], not [Bloom.Add].
 // Not safe for concurrent use; see [Bloom.AddIfNotHasHashedTS].
-func (bl *Bloom) AddIfNotHasHashed(h1, h2 uint64) (added bool) {
+func (bl *Bloom) AddIfNotHasHashed(digest []byte) (added bool) {
 	bl.checkMode(modeHashed)
+	h1, h2 := splitDigest(digest)
 	h2 |= 1
 	contained := true
 	for i := uint64(0); i < bl.setLocs; i++ {
@@ -378,9 +371,9 @@ func (bl *Bloom) AddIfNotHasHashed(h1, h2 uint64) (added bool) {
 }
 
 // AddIfNotHasHashedTS is the thread-safe version of [Bloom.AddIfNotHasHashed].
-func (bl *Bloom) AddIfNotHasHashedTS(h1, h2 uint64) (added bool) {
+func (bl *Bloom) AddIfNotHasHashedTS(digest []byte) (added bool) {
 	bl.Mtx.Lock()
-	added = bl.AddIfNotHasHashed(h1, h2)
+	added = bl.AddIfNotHasHashed(digest)
 	bl.Mtx.Unlock()
 	return added
 }
diff --git a/bench_hashed_test.go b/bench_hashed_test.go
@@ -30,8 +30,6 @@ func makeSHA256Multihashes(n int) (mhs [][]byte, digests [][]byte) {
 var (
 	benchMHs     [][]byte
 	benchDigests [][]byte
-	benchH1s     []uint64
-	benchH2s     []uint64
 )
 
 const benchN = 2_000_000
@@ -41,11 +39,6 @@ func initBenchData() {
 		return
 	}
 	benchMHs, benchDigests = makeSHA256Multihashes(benchN)
-	benchH1s = make([]uint64, benchN)
-	benchH2s = make([]uint64, benchN)
-	for i, d := range benchDigests {
-		benchH1s[i], benchH2s[i] = SplitDigest(d)
-	}
 }
 
 func newBenchBloom() *Bloom {
@@ -71,10 +64,10 @@ func BenchmarkPerOp_Add_SipHash(b *testing.B) {
 func BenchmarkPerOp_Add_Hashed(b *testing.B) {
 	initBenchData()
 	bf := newBenchBloom()
-	h1, h2 := benchH1s[0], benchH2s[0]
+	d := benchDigests[0]
 	b.ResetTimer()
 	for range b.N {
-		bf.AddHashed(h1, h2)
+		bf.AddHashed(d)
 	}
 }
 
@@ -92,11 +85,11 @@ func BenchmarkPerOp_Has_SipHash(b *testing.B) {
 func BenchmarkPerOp_Has_Hashed(b *testing.B) {
 	initBenchData()
 	bf := newBenchBloom()
-	h1, h2 := benchH1s[0], benchH2s[0]
-	bf.AddHashed(h1, h2)
+	d := benchDigests[0]
+	bf.AddHashed(d)
 	b.ResetTimer()
 	for range b.N {
-		bf.HasHashed(h1, h2)
+		bf.HasHashed(d)
 	}
 }
 
@@ -117,8 +110,7 @@ func BenchmarkBulk2M_Add_Hashed(b *testing.B) {
 	for range b.N {
 		bf := newBenchBloom()
 		for _, d := range benchDigests {
-			h1, h2 := SplitDigest(d)
-			bf.AddHashed(h1, h2)
+			bf.AddHashed(d)
 		}
 	}
 }
@@ -141,14 +133,12 @@ func BenchmarkBulk2M_Has_Hashed(b *testing.B) {
 	initBenchData()
 	bf := newBenchBloom()
 	for _, d := range benchDigests {
-		h1, h2 := SplitDigest(d)
-		bf.AddHashed(h1, h2)
+		bf.AddHashed(d)
 	}
 	b.ResetTimer()
 	for range b.N {
 		for _, d := range benchDigests {
-			h1, h2 := SplitDigest(d)
-			bf.HasHashed(h1, h2)
+			bf.HasHashed(d)
 		}
 	}
 }
@@ -168,8 +158,7 @@ func BenchmarkBulk2M_AddIfNotHas_Hashed(b *testing.B) {
 	for range b.N {
 		bf := newBenchBloom()
 		for _, d := range benchDigests {
-			h1, h2 := SplitDigest(d)
-			bf.AddIfNotHasHashed(h1, h2)
+			bf.AddIfNotHasHashed(d)
 		}
 	}
 }
diff --git a/doc.go b/doc.go
@@ -19,9 +19,9 @@
 // # Pre-hashed keys (Kirsch-Mitzenmacher optimization)
 //
 // When the input is already a cryptographic hash digest (e.g. SHA2-256),
-// the SipHash step can be skipped entirely. [SplitDigest] extracts two
-// uint64 values from the digest, and [Bloom.AddHashed] / [Bloom.HasHashed]
-// use them directly as the double-hashing pair. This is based on the result
+// the SipHash step can be skipped entirely. [Bloom.AddHashed] and
+// [Bloom.HasHashed] read two uint64 values directly from the digest bytes
+// and use them as the double-hashing pair. This is based on the result
 // by Kirsch and Mitzenmacher ("Less Hashing, Same Performance: Building a
 // Better Bloom Filter", 2008, https://doi.org/10.1002/rsa.20208) which
 // proves that only two hash values are needed to simulate k independent
diff --git a/hashed_test.go b/hashed_test.go

Original file line number	Diff line number	Diff line change
`@@ -30,8 +30,6 @@ func makeSHA256Multihashes(n int) (mhs [][]byte, digests [][]byte) {`
`30`	`30`	`var (`
`31`	`31`	`benchMHs [][]byte`
`32`	`32`	`benchDigests [][]byte`
`33`		`- benchH1s []uint64`
`34`		`- benchH2s []uint64`
`35`	`33`	`)`
`36`	`34`
`37`	`35`	`const benchN = 2_000_000`
`@@ -41,11 +39,6 @@ func initBenchData() {`
`41`	`39`	`return`
`42`	`40`	`}`
`43`	`41`	`benchMHs, benchDigests = makeSHA256Multihashes(benchN)`
`44`		`- benchH1s = make([]uint64, benchN)`
`45`		`- benchH2s = make([]uint64, benchN)`
`46`		`- for i, d := range benchDigests {`
`47`		`- benchH1s[i], benchH2s[i] = SplitDigest(d)`
`48`		`- }`
`49`	`42`	`}`
`50`	`43`
`51`	`44`	`func newBenchBloom() *Bloom {`
`@@ -71,10 +64,10 @@ func BenchmarkPerOp_Add_SipHash(b *testing.B) {`
`71`	`64`	`func BenchmarkPerOp_Add_Hashed(b *testing.B) {`
`72`	`65`	`initBenchData()`
`73`	`66`	`bf := newBenchBloom()`
`74`		`- h1, h2 := benchH1s[0], benchH2s[0]`
	`67`	`+ d := benchDigests[0]`
`75`	`68`	`b.ResetTimer()`
`76`	`69`	`for range b.N {`
`77`		`- bf.AddHashed(h1, h2)`
	`70`	`+ bf.AddHashed(d)`
`78`	`71`	`}`
`79`	`72`	`}`
`80`	`73`
`@@ -92,11 +85,11 @@ func BenchmarkPerOp_Has_SipHash(b *testing.B) {`
`92`	`85`	`func BenchmarkPerOp_Has_Hashed(b *testing.B) {`
`93`	`86`	`initBenchData()`
`94`	`87`	`bf := newBenchBloom()`
`95`		`- h1, h2 := benchH1s[0], benchH2s[0]`
`96`		`- bf.AddHashed(h1, h2)`
	`88`	`+ d := benchDigests[0]`
	`89`	`+ bf.AddHashed(d)`
`97`	`90`	`b.ResetTimer()`
`98`	`91`	`for range b.N {`
`99`		`- bf.HasHashed(h1, h2)`
	`92`	`+ bf.HasHashed(d)`
`100`	`93`	`}`
`101`	`94`	`}`
`102`	`95`
`@@ -117,8 +110,7 @@ func BenchmarkBulk2M_Add_Hashed(b *testing.B) {`
`117`	`110`	`for range b.N {`
`118`	`111`	`bf := newBenchBloom()`
`119`	`112`	`for _, d := range benchDigests {`
`120`		`- h1, h2 := SplitDigest(d)`
`121`		`- bf.AddHashed(h1, h2)`
	`113`	`+ bf.AddHashed(d)`
`122`	`114`	`}`
`123`	`115`	`}`
`124`	`116`	`}`
`@@ -141,14 +133,12 @@ func BenchmarkBulk2M_Has_Hashed(b *testing.B) {`
`141`	`133`	`initBenchData()`
`142`	`134`	`bf := newBenchBloom()`
`143`	`135`	`for _, d := range benchDigests {`
`144`		`- h1, h2 := SplitDigest(d)`
`145`		`- bf.AddHashed(h1, h2)`
	`136`	`+ bf.AddHashed(d)`
`146`	`137`	`}`
`147`	`138`	`b.ResetTimer()`
`148`	`139`	`for range b.N {`
`149`	`140`	`for _, d := range benchDigests {`
`150`		`- h1, h2 := SplitDigest(d)`
`151`		`- bf.HasHashed(h1, h2)`
	`141`	`+ bf.HasHashed(d)`
`152`	`142`	`}`
`153`	`143`	`}`
`154`	`144`	`}`
`@@ -168,8 +158,7 @@ func BenchmarkBulk2M_AddIfNotHas_Hashed(b *testing.B) {`
`168`	`158`	`for range b.N {`
`169`	`159`	`bf := newBenchBloom()`
`170`	`160`	`for _, d := range benchDigests {`
`171`		`- h1, h2 := SplitDigest(d)`
`172`		`- bf.AddIfNotHasHashed(h1, h2)`
	`161`	`+ bf.AddIfNotHasHashed(d)`
`173`	`162`	`}`
`174`	`163`	`}`
`175`	`164`	`}`