Skip to content

Commit 0d1d95c

Browse files
committed
Initial try to support a variety of byte sizes for cuckoofilter.
Uses generics to switch transparently between uint8 and uint16 for the fingerprint size. Performance is neutral according to benchmarks:

~/goworkspace/bin/benchstat ~/master.benchstats ~/generics.benchstats
name             old time/op  new time/op  delta
Filter_Reset-4   10.0µs ± 0%  9.9µs ± 0%   ~  (p=1.000 n=1+1)
Filter_Insert-4  18.2µs ± 0%  18.0µs ± 0%  ~  (p=1.000 n=1+1)
Filter_Lookup-4  1.52µs ± 0%  1.51µs ± 0%  ~  (p=1.000 n=1+1)
1 parent b9d73bc commit 0d1d95c

File tree

9 files changed

+94
-55
lines changed

9 files changed

+94
-55
lines changed

bucket.go

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,21 @@ import (
55
"fmt"
66
)
77

8-
// fingerprint represents a single entry in a bucket.
9-
type fingerprint uint16
8+
type fingerprintsize interface {
9+
uint8 | uint16 | uint32
10+
}
1011

1112
// bucket keeps track of fingerprints hashing to the same index.
12-
type bucket [bucketSize]fingerprint
13+
type bucket[T fingerprintsize] [bucketSize]T
1314

1415
const (
15-
nullFp = 0
16-
bucketSize = 4
17-
fingerprintSizeBits = 16
18-
maxFingerprint = (1 << fingerprintSizeBits) - 1
16+
nullFp = 0
17+
bucketSize = 4
1918
)
2019

2120
// insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded.
2221
// Note it allows inserting the same fingerprint multiple times.
23-
func (b *bucket) insert(fp fingerprint) bool {
22+
func (b *bucket[T]) insert(fp T) bool {
2423
for i, tfp := range b {
2524
if tfp == nullFp {
2625
b[i] = fp
@@ -32,7 +31,7 @@ func (b *bucket) insert(fp fingerprint) bool {
3231

3332
// delete a fingerprint from a bucket.
3433
// Returns true if the fingerprint was present and successfully removed.
35-
func (b *bucket) delete(fp fingerprint) bool {
34+
func (b *bucket[T]) delete(fp T) bool {
3635
for i, tfp := range b {
3736
if tfp == fp {
3837
b[i] = nullFp
@@ -42,7 +41,7 @@ func (b *bucket) delete(fp fingerprint) bool {
4241
return false
4342
}
4443

45-
func (b *bucket) contains(needle fingerprint) bool {
44+
func (b *bucket[T]) contains(needle T) bool {
4645
for _, fp := range b {
4746
if fp == needle {
4847
return true
@@ -52,13 +51,13 @@ func (b *bucket) contains(needle fingerprint) bool {
5251
}
5352

5453
// reset deletes all fingerprints in the bucket.
55-
func (b *bucket) reset() {
54+
func (b *bucket[T]) reset() {
5655
for i := range b {
5756
b[i] = nullFp
5857
}
5958
}
6059

61-
func (b *bucket) String() string {
60+
func (b *bucket[T]) String() string {
6261
var buf bytes.Buffer
6362
buf.WriteString("[")
6463
for _, by := range b {

bucket_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ import (
66
)
77

88
func TestBucket_Reset(t *testing.T) {
9-
var bkt bucket
10-
for i := fingerprint(0); i < bucketSize; i++ {
9+
var bkt bucket[uint16]
10+
for i := uint16(0); i < bucketSize; i++ {
1111
bkt[i] = i
1212
}
1313
bkt.reset()
1414

15-
var want bucket
15+
var want bucket[uint16]
1616
if !reflect.DeepEqual(bkt, want) {
1717
t.Errorf("bucket.reset() got %v, want %v", bkt, want)
1818
}

cuckoofilter.go

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,37 +11,56 @@ import (
1111
const maxCuckooKickouts = 500
1212

1313
// Filter is a probabilistic counter.
14-
type Filter struct {
15-
buckets []bucket
16-
count uint
14+
type Filter[T fingerprintsize] struct {
15+
buckets []bucket[T]
16+
getFingerprint func(hash uint64) T
17+
count uint
1718
// Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2,
1819
// applying this mask mimics the operation x % len(buckets).
1920
bucketIndexMask uint
2021
}
2122

22-
// NewFilter returns a new cuckoofilter suitable for the given number of elements.
23-
// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether.
24-
// A capacity of 1000000 is a normal default, which allocates
25-
// about ~2MB on 64-bit machines.
26-
func NewFilter(numElements uint) *Filter {
23+
func numBuckets(numElements uint) uint {
2724
numBuckets := getNextPow2(uint64(numElements / bucketSize))
2825
if float64(numElements)/float64(numBuckets*bucketSize) > 0.96 {
2926
numBuckets <<= 1
3027
}
3128
if numBuckets == 0 {
3229
numBuckets = 1
3330
}
34-
buckets := make([]bucket, numBuckets)
35-
return &Filter{
31+
return numBuckets
32+
}
33+
34+
// NewFilter returns a new cuckoofilter suitable for the given number of elements.
35+
// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether.
36+
// A capacity of 1000000 is a normal default, which allocates
37+
// about ~2MB on 64-bit machines.
38+
func NewFilter(numElements uint) *Filter[uint16] {
39+
buckets := make([]bucket[uint16], numBuckets(numElements))
40+
return &Filter[uint16]{
41+
buckets: buckets,
42+
count: 0,
43+
bucketIndexMask: uint(len(buckets) - 1),
44+
getFingerprint: getFinterprintUint16,
45+
}
46+
}
47+
48+
// NewFilterLowPrecision is the same as NewFilter, but returns a filter that uses
49+
// half the memory at the cost of lower precision.
50+
func NewFilterLowPrecision(numElements uint) *Filter[uint8] {
51+
buckets := make([]bucket[uint8], numBuckets(numElements))
52+
return &Filter[uint8]{
3653
buckets: buckets,
3754
count: 0,
3855
bucketIndexMask: uint(len(buckets) - 1),
56+
getFingerprint: getFinterprintUint8,
3957
}
4058
}
4159

60+
4261
// Lookup returns true if data is in the filter.
43-
func (cf *Filter) Lookup(data []byte) bool {
44-
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
62+
func (cf *Filter[T]) Lookup(data []byte) bool {
63+
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint)
4564
if b := cf.buckets[i1]; b.contains(fp) {
4665
return true
4766
}
@@ -51,7 +70,7 @@ func (cf *Filter) Lookup(data []byte) bool {
5170
}
5271

5372
// Reset removes all items from the filter, setting count to 0.
54-
func (cf *Filter) Reset() {
73+
func (cf *Filter[T]) Reset() {
5574
for i := range cf.buckets {
5675
cf.buckets[i].reset()
5776
}
@@ -62,8 +81,8 @@ func (cf *Filter) Reset() {
6281
// * Might return false negatives
6382
// * Deletes are not guaranteed to work
6483
// To increase success rate of inserts, create a larger filter.
65-
func (cf *Filter) Insert(data []byte) bool {
66-
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
84+
func (cf *Filter[T]) Insert(data []byte) bool {
85+
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint)
6786
if cf.insert(fp, i1) {
6887
return true
6988
}
@@ -74,15 +93,15 @@ func (cf *Filter) Insert(data []byte) bool {
7493
return cf.reinsert(fp, randi(i1, i2))
7594
}
7695

77-
func (cf *Filter) insert(fp fingerprint, i uint) bool {
96+
func (cf *Filter[T]) insert(fp T, i uint) bool {
7897
if cf.buckets[i].insert(fp) {
7998
cf.count++
8099
return true
81100
}
82101
return false
83102
}
84103

85-
func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
104+
func (cf *Filter[T]) reinsert(fp T, i uint) bool {
86105
for k := 0; k < maxCuckooKickouts; k++ {
87106
j := rand.Intn(bucketSize)
88107
// Swap fingerprint with bucket entry.
@@ -98,13 +117,13 @@ func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
98117
}
99118

100119
// Delete data from the filter. Returns true if the data was found and deleted.
101-
func (cf *Filter) Delete(data []byte) bool {
102-
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
120+
func (cf *Filter[T]) Delete(data []byte) bool {
121+
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint)
103122
i2 := getAltIndex(fp, i1, cf.bucketIndexMask)
104123
return cf.delete(fp, i1) || cf.delete(fp, i2)
105124
}
106125

107-
func (cf *Filter) delete(fp fingerprint, i uint) bool {
126+
func (cf *Filter[T]) delete(fp T, i uint) bool {
108127
if cf.buckets[i].delete(fp) {
109128
cf.count--
110129
return true
@@ -113,19 +132,20 @@ func (cf *Filter) delete(fp fingerprint, i uint) bool {
113132
}
114133

115134
// Count returns the number of items in the filter.
116-
func (cf *Filter) Count() uint {
135+
func (cf *Filter[T]) Count() uint {
117136
return cf.count
118137
}
119138

120139
// LoadFactor returns the fraction of slots that are occupied.
121-
func (cf *Filter) LoadFactor() float64 {
140+
func (cf *Filter[T]) LoadFactor() float64 {
122141
return float64(cf.count) / float64(len(cf.buckets)*bucketSize)
123142
}
124143

125-
const bytesPerBucket = bucketSize * fingerprintSizeBits / 8
144+
// TODO(panmari): Size of fingerprint needs to be derived from type. Currently hardcoded to 16 for uint16.
145+
const bytesPerBucket = bucketSize * 16 / 8
126146

127147
// Encode returns a byte slice representing a Cuckoofilter.
128-
func (cf *Filter) Encode() []byte {
148+
func (cf *Filter[T]) Encode() []byte {
129149
bytes := make([]byte, 0, len(cf.buckets)*bytesPerBucket)
130150
for _, b := range cf.buckets {
131151
for _, f := range b {
@@ -138,7 +158,7 @@ func (cf *Filter) Encode() []byte {
138158
}
139159

140160
// Decode returns a Cuckoofilter from a byte slice created using Encode.
141-
func Decode(bytes []byte) (*Filter, error) {
161+
func Decode(bytes []byte) (*Filter[uint16], error) {
142162
if len(bytes)%bucketSize != 0 {
143163
return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(bytes))
144164
}
@@ -150,21 +170,22 @@ func Decode(bytes []byte) (*Filter, error) {
150170
return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets)
151171
}
152172
var count uint
153-
buckets := make([]bucket, numBuckets)
173+
buckets := make([]bucket[uint16], numBuckets)
154174
for i, b := range buckets {
155175
for j := range b {
156176
var next []byte
157177
next, bytes = bytes[:2], bytes[2:]
158178

159-
if fp := fingerprint(binary.LittleEndian.Uint16(next)); fp != 0 {
179+
if fp := binary.LittleEndian.Uint16(next); fp != 0 {
160180
buckets[i][j] = fp
161181
count++
162182
}
163183
}
164184
}
165-
return &Filter{
185+
return &Filter[uint16]{
166186
buckets: buckets,
167187
count: count,
168188
bucketIndexMask: uint(len(buckets) - 1),
189+
getFingerprint: getFinterprintUint16,
169190
}, nil
170191
}

cuckoofilter_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ import (
77
"io"
88
"math"
99
"os"
10-
"reflect"
1110
"testing"
1211

1312
"github.com/google/go-cmp/cmp"
13+
"github.com/google/go-cmp/cmp/cmpopts"
1414
)
1515

1616
// optFloatNear considers float64 as equal if the relative delta is small.
@@ -231,7 +231,9 @@ func TestEncodeDecode(t *testing.T) {
231231
if err != nil {
232232
t.Errorf("Expected no error, got %v", err)
233233
}
234-
if !reflect.DeepEqual(cf, got) {
234+
if !cmp.Equal(cf, got,
235+
cmp.AllowUnexported(Filter[uint16]{}),
236+
cmpopts.IgnoreFields(Filter[uint16]{}, "getFingerprint")) {
235237
t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded)
236238
}
237239
}

example_threadsafe_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import (
99

1010
// Small wrapper around cuckoo filter making it thread safe.
1111
type threadSafeFilter struct {
12-
cf *cuckoo.Filter
12+
cf *cuckoo.Filter[uint16]
1313
mu sync.RWMutex
1414
}
1515

go.mod

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
module github.com/panmari/cuckoofilter
22

3-
go 1.15
3+
go 1.18
44

55
require (
6-
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165
6+
github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446
77
github.com/google/go-cmp v0.5.2
88
)
9+
10+
require golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 // indirect

go.sum

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E=
2-
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
1+
github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446 h1:QnWGyQI3H080vbC9E4jlr6scOYEnALtvV/69oATYzOo=
2+
github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
33
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
44
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
5+
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
56
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

util.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,35 @@ func randi(i1, i2 uint) uint {
1515
return i2
1616
}
1717

18-
func getAltIndex(fp fingerprint, i uint, bucketIndexMask uint) uint {
18+
func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint {
1919
b := make([]byte, 2)
2020
binary.LittleEndian.PutUint16(b, uint16(fp))
2121
hash := uint(metro.Hash64(b, 1337))
2222
return (i ^ hash) & bucketIndexMask
2323
}
2424

25-
func getFingerprint(hash uint64) fingerprint {
25+
func getFinterprintUint16(hash uint64) uint16 {
26+
const fingerprintSizeBits = 16
27+
const maxFingerprint = (1 << fingerprintSizeBits) - 1
2628
// Use most significant bits for fingerprint.
2729
shifted := hash >> (64 - fingerprintSizeBits)
2830
// Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state.
2931
fp := shifted%(maxFingerprint-1) + 1
30-
return fingerprint(fp)
32+
return uint16(fp)
33+
}
34+
35+
func getFinterprintUint8(hash uint64) uint8 {
36+
const fingerprintSizeBits = 8
37+
const maxFingerprint = (1 << fingerprintSizeBits) - 1
38+
// Use most significant bits for fingerprint.
39+
shifted := hash >> (64 - fingerprintSizeBits)
40+
// Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state.
41+
fp := shifted%(maxFingerprint-1) + 1
42+
return uint8(fp)
3143
}
3244

3345
// getIndexAndFingerprint returns the primary bucket index and fingerprint to be used
34-
func getIndexAndFingerprint(data []byte, bucketIndexMask uint) (uint, fingerprint) {
46+
func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, getFingerprint func(uint64) T) (uint, T) {
3547
hash := metro.Hash64(data, 1337)
3648
f := getFingerprint(hash)
3749
// Use least significant bits for deriving index.

util_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ import (
77
func TestIndexAndFP(t *testing.T) {
88
data := []byte("seif")
99
numBuckets := uint(1024)
10-
i1, fp := getIndexAndFingerprint(data, numBuckets)
10+
i1, fp := getIndexAndFingerprint(data, numBuckets, func(in uint64) uint16 {
11+
return 2
12+
})
1113
i2 := getAltIndex(fp, i1, numBuckets)
1214
i11 := getAltIndex(fp, i2, numBuckets)
1315
i22 := getAltIndex(fp, i1, numBuckets)

0 commit comments

Comments
 (0)