1
- require 'bitset' # gem
2
1
require 'zlib' # stdlib
3
- require 'digest ' # stdlib
2
+ require 'bitset ' # gem
4
3
5
4
class BloomFilter
6
- # return an array of bit indices ("on bits") via repeated string hashing
7
- # start with the fastest/cheapest algos, up to 8 rounds
8
- # beyond that, perform cyclic "hashing" with CRC32
9
- def self . hash_bits ( str , num_hashes :, num_bits :)
10
- val = 0 # for cyclic hashing
11
- Array . new ( num_hashes ) { |i |
12
- case i
13
- when 0 then str . hash
14
- when 1 then Zlib . crc32 ( str )
15
- when 2 then Digest ::MD5 . hexdigest ( str ) . to_i ( 16 )
16
- when 3 then Digest ::SHA1 . hexdigest ( str ) . to_i ( 16 )
17
- when 4 then Digest ::SHA256 . hexdigest ( str ) . to_i ( 16 )
18
- when 5 then Digest ::SHA384 . hexdigest ( str ) . to_i ( 16 )
19
- when 6 then Digest ::SHA512 . hexdigest ( str ) . to_i ( 16 )
20
- when 7 then Digest ::RMD160 . hexdigest ( str ) . to_i ( 16 )
21
- else # cyclic hashing with CRC32
22
- val = Zlib . crc32 ( str , val )
23
- end % num_bits
24
- }
25
- end
5
+ MAX_BITS = 2 **32 # CRC32 yields 32-bit values
26
6
27
- attr_reader :bitmap
7
+ attr_reader :bits , :aspects , : bitmap
28
8
29
9
# Build an empty filter backed by a Bitset holding `bits` bits.
#
# The default values require 8 kilobytes of storage and recognize:
#   < 7000 strings at 1% False Positive Rate (4k @ 0.1%) (10k @ 5%)
# FPR goes up as more strings are added.
#
# bits    - number of bits in the bitmap; must not exceed MAX_BITS
# aspects - number of hashing rounds stored for later use as @aspects
#
# Raises RuntimeError (message "bits: <n>") when bits exceeds MAX_BITS.
def initialize(bits: 2**16, aspects: 5)
  @bits = bits
  # Checked before the bitmap is allocated so an oversized request fails early.
  raise "bits: #{@bits}" if @bits > MAX_BITS

  @aspects = aspects
  @bitmap = Bitset.new(@bits)
end
39
18
40
# Return an array of bit indices ("on bits") corresponding to
# multiple rounds of string hashing (CRC32 is fast and ~fine~).
#
# Each round passes the previous checksum back to Zlib.crc32 as the running
# value, so the rounds are chained rather than recomputing one checksum.
# Every checksum is then reduced modulo @bits to produce an index.
# Note: for an empty string the checksum stays 0 on every round.
#
# str - the String to hash
#
# Returns an Array of @aspects Integers.
def aspect_bits(str)
  crc = 0
  Array.new(@aspects) do
    crc = Zlib.crc32(str, crc)
    crc % @bits
  end
end
43
25
44
26
# Record a string in the filter by switching on every bit index that
# aspect_bits derives for it.
#
# str - the String to add
#
# Returns whatever @bitmap.set returns.
def add(str)
  indices = aspect_bits(str)
  @bitmap.set(*indices)
end
47
29
alias_method ( :<< , :add )
48
30
31
# Check whether every bit index that aspect_bits derives for the string
# is switched on in the bitmap.
# true or false; a `true` result may be a "false positive"
#
# str - the String to look up
#
# Returns whatever @bitmap.set? returns for those indices.
def include?(str)
  indices = aspect_bits(str)
  @bitmap.set?(*indices)
end
52
35
36
# Estimate how likely it is that the string was really added.
#
# str - the String to look up
#
# Returns 0 when include? rejects the string; otherwise 1.0 - fpr,
# a number like 0.95036573.
def likelihood(str)
  return 0 unless include?(str)

  1.0 - fpr
end
56
40
alias_method ( :[] , :likelihood )
57
41
42
# How full the bitmap is, as a fraction of @bits (to_s multiplies this by
# 100 for display, so despite the name this is not a 0..100 percentage).
# relatively expensive; don't test against this in a loop
#
# Presumably @bitmap.to_a lists the indices of the set bits, which is why
# its element count is used here -- confirm against the bitset gem.
#
# Returns a Float.
def percent_full
  on_bits = @bitmap.to_a
  on_bits.count.to_f / @bits
end
61
46
62
47
# Estimated false positive rate: the fill fraction from percent_full raised
# to the power of @aspects.
#
# Returns a Float (as a fraction, not a percentage).
def fpr
  fill = percent_full
  fill**@aspects
end
65
50
66
51
# One-line summary of the filter: bitmap size in bits and kilobytes, the
# number of aspects, how full the bitmap is, and the current FPR.
#
# 2**13 bits make one kilobyte (8 bits * 1024), and percent_full and fpr are
# fractions, so both are scaled by 100 for display.
#
# Returns a String.
def to_s
  kilobytes = @bits.to_f / 2**13
  format("%i bits (%.1f kB, %i aspects) %i%% full; FPR: %.3f%%",
         @bits, kilobytes, @aspects,
         percent_full * 100, fpr * 100)
end
71
56
alias_method ( :inspect , :to_s )
@@ -76,7 +61,7 @@ def to_s
76
61
puts "Two empty lines to quit"
77
62
puts
78
63
79
- bf = BloomFilter . new ( num_bits : 512 , num_hashes : 5 )
64
+ bf = BloomFilter . new ( bits : 512 , aspects : 5 )
80
65
num = 0
81
66
last = ''
82
67
0 commit comments