Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 64 additions & 4 deletions bbloom.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ var (
ErrInvalidParms = errors.New("one of the parameters was outside of allowed range")
)

// New creates a bloom filter. It accepts exactly two float64 arguments:
// New creates a bloom filter with default SipHash keys. It accepts exactly
// two float64 arguments:
//
// - If the second parameter is < 1 it is treated as a false-positive rate,
// and the filter is sized automatically.
Expand All @@ -65,6 +66,10 @@ var (
// - If the second parameter is >= 1 it is treated as the number of hash
// locations, and the first parameter is the bitset size.
// Example: New(650000.0, 7.0) -- 650000-bit filter, 7 hash locations.
//
// The default SipHash keys are publicly known constants. If the filter will
// hold data controlled by untrusted parties, use [NewWithKeys] instead to
// prevent hash-flooding attacks.
func New(params ...float64) (bloomfilter *Bloom, err error) {
var entries, locs uint64
if len(params) == 2 {
Expand All @@ -86,11 +91,40 @@ func New(params ...float64) (bloomfilter *Bloom, err error) {
setLocs: locs,
shift: 64 - exponent,
bitset: make([]uint64, size>>6),
k0: defaultK0,
k1: defaultK1,
hashVersion: 1,
}
return bloomfilter, nil
}

// NewWithKeys creates a bloom filter whose SipHash keys are supplied by the
// caller instead of taken from the package defaults.
//
// [New] always uses the same key pair, which is a constant in the source
// code and therefore public. Anyone who knows the keys can craft inputs that
// all land on the same bit positions, which fills the filter faster than
// normal and raises the false-positive rate. That matters whenever untrusted
// parties choose what goes into the filter (e.g. content-addressed blocks
// fetched from the network).
//
// Supplying random, secret keys (e.g. generated once per node from
// crypto/rand) restores SipHash's anti-collision guarantees and makes such
// attacks infeasible.
//
// params has the same meaning as in [New], and any error from [New] is
// returned unchanged together with a nil filter. Custom keys survive a
// [Bloom.JSONMarshal] / [JSONUnmarshal] round-trip.
// Note: the keys are written in plaintext into the [Bloom.JSONMarshal]
// output, so treat serialized filters accordingly.
func NewWithKeys(k0, k1 uint64, params ...float64) (*Bloom, error) {
	filter, err := New(params...)
	if err != nil {
		return nil, err
	}
	// New installed the default keys; replace both halves together.
	filter.k0, filter.k1 = k0, k1
	return filter, nil
}

// NewWithBoolset creates a bloom filter from a pre-existing bitset.
// bs is the serialized bitset (big-endian uint64 words) and locs is the
// number of hash locations per entry.
Expand All @@ -105,12 +139,24 @@ func NewWithBoolset(bs []byte, locs uint64) (bloomfilter *Bloom) {
return bloomfilter
}

// NewWithBoolsetAndKeys rebuilds a bloom filter from an existing bitset and
// installs the caller-provided SipHash keys k0 and k1 on it. The bitset bs
// and the hash-location count locs are handed to [NewWithBoolset] unchanged;
// see that function for how they are interpreted, and [NewWithKeys] for why
// custom keys matter.
func NewWithBoolsetAndKeys(bs []byte, locs, k0, k1 uint64) (bloomfilter *Bloom) {
	bloomfilter = NewWithBoolset(bs, locs)
	// Override whatever keys NewWithBoolset left on the filter.
	bloomfilter.k0, bloomfilter.k1 = k0, k1
	return bloomfilter
}

// bloomJSONImExport is the import/export structure used by JSONMarshal and
// JSONUnmarshal. It is the on-the-wire JSON shape of a Bloom filter.
type bloomJSONImExport struct {
	// FilterSet is the bitset serialized as big-endian uint64 words
	// (encoding/json renders a []byte as a base64 string).
	FilterSet []byte
	// SetLocs is the number of hash locations per entry.
	SetLocs uint64
	// Version is the filter's hash version (0 = legacy, 1 = l|=1 fix).
	// With omitempty a legacy filter serializes without this field.
	Version uint8 `json:"Version,omitempty"`
	// K0 and K1 are the SipHash keys. They are pointers so that "absent"
	// can be told apart from a zero key: marshal leaves both nil when the
	// filter uses the default keys, and JSONUnmarshal rejects input where
	// only one of the two is present.
	K0 *uint64 `json:"K0,omitempty"`
	K1 *uint64 `json:"K1,omitempty"`
}

// Bloom is a bloom filter backed by a power-of-two sized bitset.
Expand All @@ -125,7 +171,8 @@ type Bloom struct {
shift uint64

content uint64
hashVersion uint8 // 0 = legacy, 1 = l|=1 fix (issue #11)
k0, k1 uint64 // SipHash keys
hashVersion uint8 // 0 = legacy, 1 = l|=1 fix (issue #11)
}

// ElementsAdded returns the number of elements added to the bloom filter.
Expand Down Expand Up @@ -256,6 +303,11 @@ func (bl *Bloom) marshal() bloomJSONImExport {
bloomImEx := bloomJSONImExport{}
bloomImEx.SetLocs = uint64(bl.setLocs)
bloomImEx.Version = bl.hashVersion
if bl.k0 != defaultK0 || bl.k1 != defaultK1 {
k0, k1 := bl.k0, bl.k1
bloomImEx.K0 = &k0
bloomImEx.K1 = &k1
}
bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
for i, w := range bl.bitset {
binary.BigEndian.PutUint64(bloomImEx.FilterSet[i<<3:], w)
Expand Down Expand Up @@ -294,7 +346,15 @@ func JSONUnmarshal(dbData []byte) (*Bloom, error) {
if err != nil {
return nil, err
}
bf := NewWithBoolset(bloomImEx.FilterSet, bloomImEx.SetLocs)
if (bloomImEx.K0 == nil) != (bloomImEx.K1 == nil) {
return nil, errors.New("both K0 and K1 must be present or both absent")
}
var bf *Bloom
if bloomImEx.K0 != nil && bloomImEx.K1 != nil {
bf = NewWithBoolsetAndKeys(bloomImEx.FilterSet, bloomImEx.SetLocs, *bloomImEx.K0, *bloomImEx.K1)
} else {
bf = NewWithBoolset(bloomImEx.FilterSet, bloomImEx.SetLocs)
}
bf.hashVersion = bloomImEx.Version
return bf, nil
}
Expand Down
171 changes: 171 additions & 0 deletions bbloom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"math"
"os"
"strconv"
"strings"
"testing"
)

Expand Down Expand Up @@ -81,6 +82,7 @@ func TestM_JSON(t *testing.T) {
t.Errorf("FAILED !AddIfNotHas = %v; want %v", cnt2, shallBe)
}
}

func TestSipHashLowAlwaysOdd(t *testing.T) {
bf, err := New(float64(1<<20), float64(7))
if err != nil {
Expand All @@ -96,6 +98,38 @@ func TestSipHashLowAlwaysOdd(t *testing.T) {
}
}

// TestNewWithKeys checks that a filter built with caller-supplied SipHash
// keys hashes an entry differently from a default-keyed filter, and that it
// still reports every added entry as present.
func TestNewWithKeys(t *testing.T) {
	const (
		customK0 = uint64(0x0123456789abcdef)
		customK1 = uint64(0xfedcba9876543210)
	)

	keyed, err := NewWithKeys(customK0, customK1, float64(n*10), float64(7))
	if err != nil {
		t.Fatal(err)
	}
	plain, err := New(float64(n*10), float64(7))
	if err != nil {
		t.Fatal(err)
	}

	// The same input under different keys must not give the same hash pair.
	probe := []byte("test-entry")
	keyedL, keyedH := keyed.sipHash(probe)
	plainL, plainH := plain.sipHash(probe)
	if keyedL == plainL && keyedH == plainH {
		t.Fatal("custom keys produced same hash as default keys")
	}

	// Membership must still hold for everything added under custom keys.
	for _, word := range wordlist1 {
		keyed.Add(word)
	}
	for _, word := range wordlist1 {
		if !keyed.Has(word) {
			t.Fatalf("Has(%q) = false after Add", word)
		}
	}
}

func TestJSONBackwardCompatV0(t *testing.T) {
// simulate a filter created with the legacy hash (version 0)
bf, err := New(float64(n*10), float64(7))
Expand Down Expand Up @@ -129,6 +163,39 @@ func TestJSONBackwardCompatV0(t *testing.T) {
}
}

// TestNewWithKeysJSON verifies that a custom-key filter survives a
// JSONMarshal / JSONUnmarshal round-trip: the restored filter carries the
// same keys and still contains every entry that was added before export.
func TestNewWithKeysJSON(t *testing.T) {
	const (
		wantK0 = uint64(0x0123456789abcdef)
		wantK1 = uint64(0xfedcba9876543210)
	)

	original, err := NewWithKeys(wantK0, wantK1, float64(n*10), float64(7))
	if err != nil {
		t.Fatal(err)
	}

	sample := wordlist1[:1000]
	for i := range sample {
		original.Add(sample[i])
	}

	restored, err := JSONUnmarshal(original.JSONMarshal())
	if err != nil {
		t.Fatal(err)
	}

	// Both key halves must come back exactly as they went in.
	if !(restored.k0 == wantK0 && restored.k1 == wantK1) {
		t.Fatalf("keys not preserved: got k0=%x k1=%x, want k0=%x k1=%x", restored.k0, restored.k1, wantK0, wantK1)
	}

	for i := range sample {
		if !restored.Has(sample[i]) {
			t.Fatalf("custom-key filter lost entry %q after JSON round-trip", sample[i])
		}
	}
}

func TestJSONRoundTripV1(t *testing.T) {
bf, err := New(float64(n*10), float64(7))
if err != nil {
Expand Down Expand Up @@ -158,6 +225,110 @@ func TestJSONRoundTripV1(t *testing.T) {
}
}

// TestJSONUnmarshalPartialKeys ensures JSONUnmarshal rejects input that
// carries only one of the two SipHash keys instead of silently falling back
// to the defaults.
func TestJSONUnmarshalPartialKeys(t *testing.T) {
	cases := []struct {
		payload string // JSON document with exactly one key present
		failMsg string // reported when JSONUnmarshal wrongly succeeds
	}{
		{
			payload: `{"FilterSet":"AAAAAAAAAA==","SetLocs":3,"K0":42}`,
			failMsg: "expected error for JSON with K0 but no K1",
		},
		{
			payload: `{"FilterSet":"AAAAAAAAAA==","SetLocs":3,"K1":99}`,
			failMsg: "expected error for JSON with K1 but no K0",
		},
	}

	for _, tc := range cases {
		if _, err := JSONUnmarshal([]byte(tc.payload)); err == nil {
			t.Fatal(tc.failMsg)
		}
	}
}

// TestDefaultKeysOmittedFromJSON checks both sides of the key-serialization
// rule: a filter using the default keys must not mention K0/K1 in its JSON,
// while a filter built with custom keys must include both.
func TestDefaultKeysOmittedFromJSON(t *testing.T) {
	defaultKeyed, err := New(float64(512), float64(3))
	if err != nil {
		t.Fatal(err)
	}
	defaultKeyed.Add([]byte("test"))

	if out := string(defaultKeyed.JSONMarshal()); strings.Contains(out, "K0") || strings.Contains(out, "K1") {
		t.Fatalf("default keys should not appear in JSON: %s", out)
	}

	// With non-default keys both fields have to be present.
	customKeyed, err := NewWithKeys(42, 99, float64(512), float64(3))
	if err != nil {
		t.Fatal(err)
	}
	customKeyed.Add([]byte("test"))

	out := string(customKeyed.JSONMarshal())
	hasBothKeys := strings.Contains(out, "K0") && strings.Contains(out, "K1")
	if !hasBothKeys {
		t.Fatalf("custom keys should appear in JSON: %s", out)
	}
}

// TestNewWithBoolsetAndKeys covers custom-key handling when a filter is
// rebuilt from existing data. It checks that NewWithBoolsetAndKeys stores
// the keys it is given, that a populated custom-key filter keeps its entries
// across a JSON round-trip (JSONUnmarshal rebuilds such filters through
// NewWithBoolsetAndKeys), and that swapping in the default keys makes
// lookups miss.
func TestNewWithBoolsetAndKeys(t *testing.T) {
	const (
		keyA = uint64(0x0123456789abcdef)
		keyB = uint64(0xfedcba9876543210)
	)
	added := wordlist1[:1000]

	// Reference filter: custom keys, populated with the sample entries.
	ref, err := NewWithKeys(keyA, keyB, float64(n*10), float64(7))
	if err != nil {
		t.Fatal(err)
	}
	for i := range added {
		ref.Add(added[i])
	}

	// Round-trip the populated filter through its JSON form; the restored
	// copy is what the membership subtest below inspects.
	serialized := ref.JSONMarshal()
	restored, err := JSONUnmarshal(serialized)
	if err != nil {
		t.Fatal(err)
	}

	t.Run("keys are stored", func(t *testing.T) {
		// The constructor must copy k0/k1 into the Bloom struct, otherwise
		// every lookup would use the wrong hash positions.
		built := NewWithBoolsetAndKeys(make([]byte, 64), 7, keyA, keyB)
		if built.k0 == keyA && built.k1 == keyB {
			return
		}
		t.Fatalf("keys not set: got k0=%x k1=%x, want k0=%x k1=%x",
			built.k0, built.k1, keyA, keyB)
	})

	t.Run("entries survive bitset round-trip", func(t *testing.T) {
		// Same bitset plus same keys must recognize every original entry.
		for i := range added {
			if !restored.Has(added[i]) {
				t.Fatalf("entry %q lost after round-trip", added[i])
			}
		}
	})

	t.Run("wrong keys miss entries", func(t *testing.T) {
		// Restore the custom-key filter, then force the default keys onto
		// it. At least one lookup has to fail, which shows the keys really
		// feed into hashing.
		mismatched, err := JSONUnmarshal(ref.JSONMarshal())
		if err != nil {
			t.Fatal(err)
		}
		mismatched.k0, mismatched.k1 = defaultK0, defaultK1

		var missCount int
		for i := range added {
			if !mismatched.Has(added[i]) {
				missCount++
			}
		}
		if missCount == 0 {
			t.Fatal("default keys matched every entry; custom keys had no effect")
		}
	})
}

func TestFillRatio(t *testing.T) {
bf, err := New(float64(512), float64(7))
if err != nil {
Expand Down
4 changes: 4 additions & 0 deletions doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
// bitset up to the next power of two for fast masking, and provides both
// non-thread-safe and mutex-protected (TS-suffixed) variants of all operations.
//
// By default ([New]) the filter uses publicly known SipHash keys. When the
// filter will hold data controlled by untrusted parties, use [NewWithKeys]
// with random secret keys to prevent hash-flooding attacks.
//
// Filters can be serialized to JSON with [Bloom.JSONMarshal] and restored
// with [JSONUnmarshal].
package bbloom
26 changes: 20 additions & 6 deletions sipHash.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,28 @@

package bbloom

// Hash returns the 64-bit SipHash-2-4 of the given byte slice with two 64-bit
// parts of 128-bit key: k0 and k1.
// SipHash-2-4 initialization constants. sipHash XORs them with the filter's
// k0/k1 keys to form the initial state words v0..v3. Read as big-endian
// ASCII, the four values spell "somepseudorandomlygeneratedbytes".
const (
	sipC0 = 0x736f6d6570736575 // "somepseu"
	sipC1 = 0x646f72616e646f6d // "dorandom"
	sipC2 = 0x6c7967656e657261 // "lygenera"
	sipC3 = 0x7465646279746573 // "tedbytes"
)

// defaultK0 and defaultK1 are the SipHash keys used when the caller does not
// supply any. They keep the values that were originally hardcoded
// (0xdeadbeaf, 0xfaebdaed).
const (
	defaultK0 uint64 = 0xdeadbeaf
	defaultK1 uint64 = 0xfaebdaed
)

// sipHash returns the 64-bit SipHash-2-4 of the given byte slice using
// the bloom filter's k0/k1 keys, split into two parts for double-hashing.
func (bl *Bloom) sipHash(p []byte) (l, h uint64) {
// Initialization.
v0 := uint64(8317987320269560794) // k0 ^ 0x736f6d6570736575
v1 := uint64(7237128889637516672) // k1 ^ 0x646f72616e646f6d
v2 := uint64(7816392314733513934) // k0 ^ 0x6c7967656e657261
v3 := uint64(8387220255325274014) // k1 ^ 0x7465646279746573
v0 := bl.k0 ^ sipC0
v1 := bl.k1 ^ sipC1
v2 := bl.k0 ^ sipC2
v3 := bl.k1 ^ sipC3
t := uint64(len(p)) << 56

// Compression.
Expand Down
Loading