Skip to content

Commit 13f4628

Browse files
committed
stop using Digest::MD5 and friends
- just use CRC32 - it's fast and works very well for this purpose
1 parent 6f72e25 commit 13f4628

File tree

1 file changed

+25
-40
lines changed

1 file changed

+25
-40
lines changed

examples/pure-ruby-bf.rb

+25 -40
Original file line number | Diff line number | Diff line change
@@ -1,71 +1,56 @@
1-
require 'bitset' # gem
21
require 'zlib' # stdlib
3-
require 'digest' # stdlib
2+
require 'bitset' # gem
43

54
class BloomFilter
6-
# return an array of bit indices ("on bits") via repeated string hashing
7-
# start with the fastest/cheapest algos, up to 8 rounds
8-
# beyond that, perform cyclic "hashing" with CRC32
9-
def self.hash_bits(str, num_hashes:, num_bits:)
10-
val = 0 # for cyclic hashing
11-
Array.new(num_hashes) { |i|
12-
case i
13-
when 0 then str.hash
14-
when 1 then Zlib.crc32(str)
15-
when 2 then Digest::MD5.hexdigest(str).to_i(16)
16-
when 3 then Digest::SHA1.hexdigest(str).to_i(16)
17-
when 4 then Digest::SHA256.hexdigest(str).to_i(16)
18-
when 5 then Digest::SHA384.hexdigest(str).to_i(16)
19-
when 6 then Digest::SHA512.hexdigest(str).to_i(16)
20-
when 7 then Digest::RMD160.hexdigest(str).to_i(16)
21-
else # cyclic hashing with CRC32
22-
val = Zlib.crc32(str, val)
23-
end % num_bits
24-
}
25-
end
5+
MAX_BITS = 2**32 # CRC32 yields 32-bit values
266

27-
attr_reader :bitmap
7+
attr_reader :bits, :aspects, :bitmap
288

299
# The default values require 8 kilobytes of storage and recognize:
30-
# < 4000 strings: FPR 0.1%
31-
# < 7000 strings: FPR 1%
32-
# > 10k strings: FPR 5%
33-
# The false positive rate goes up as more strings are added
34-
def initialize(num_bits: 2**16, num_hashes: 5)
35-
@num_bits = num_bits
36-
@num_hashes = num_hashes
37-
@bitmap = Bitset.new(@num_bits)
10+
# < 7000 strings at 1% False Positive Rate (4k @ 0.1%) (10k @ 5%)
11+
# FPR goes up as more strings are added
12+
def initialize(bits: 2**16, aspects: 5)
13+
@bits = bits
14+
raise("bits: #{@bits}") if @bits > MAX_BITS
15+
@aspects = aspects
16+
@bitmap = Bitset.new(@bits)
3817
end
3918

40-
def hash_bits(str)
41-
self.class.hash_bits(str, num_hashes: @num_hashes, num_bits: @num_bits)
19+
# Return an array of bit indices ("on bits") corresponding to
20+
# multiple rounds of string hashing (CRC32 is fast and ~fine~)
21+
def aspect_bits(str)
22+
val = 0
23+
Array.new(@aspects) { (val = Zlib.crc32(str, val)) % @bits }
4224
end
4325

4426
def add(str)
45-
@bitmap.set *self.hash_bits(str)
27+
@bitmap.set(*self.aspect_bits(str))
4628
end
4729
alias_method(:<<, :add)
4830

31+
# true or false; a `true` result may be a "false positive"
4932
def include?(str)
50-
@bitmap.set? *self.hash_bits(str)
33+
@bitmap.set?(*self.aspect_bits(str))
5134
end
5235

36+
# returns either 0 or a number like 0.95036573
5337
def likelihood(str)
5438
self.include?(str) ? 1.0 - self.fpr : 0
5539
end
5640
alias_method(:[], :likelihood)
5741

42+
# relatively expensive; don't test against this in a loop
5843
def percent_full
59-
@bitmap.to_a.count.to_f / @num_bits
44+
@bitmap.to_a.count.to_f / @bits
6045
end
6146

6247
def fpr
63-
self.percent_full**@num_hashes
48+
self.percent_full**@aspects
6449
end
6550

6651
def to_s
67-
format("%i bits (%.1f kB, %i hashes) %i%% full; FPR: %.3f%%",
68-
@num_bits, @num_bits.to_f / 2**13, @num_hashes,
52+
format("%i bits (%.1f kB, %i aspects) %i%% full; FPR: %.3f%%",
53+
@bits, @bits.to_f / 2**13, @aspects,
6954
self.percent_full * 100, self.fpr * 100)
7055
end
7156
alias_method(:inspect, :to_s)
@@ -76,7 +61,7 @@ def to_s
7661
puts "Two empty lines to quit"
7762
puts
7863

79-
bf = BloomFilter.new(num_bits: 512, num_hashes: 5)
64+
bf = BloomFilter.new(bits: 512, aspects: 5)
8065
num = 0
8166
last = ''
8267

0 commit comments

Comments
 (0)