Skip to content

Commit 6f72e25

Browse files
authored
Merge pull request #50 from rickhull/master
rewrite the pure ruby example
2 parents 2775674 + b278c29 commit 6f72e25

File tree

1 file changed

+106
-32
lines changed

1 file changed

+106
-32
lines changed

examples/pure-ruby-bf.rb

Lines changed: 106 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,126 @@
1-
#
2-
# Pure ruby implementation of a Bloom filter, just for kicks
3-
#
4-
5-
require 'bitset'
6-
require 'zlib'
1+
require 'bitset' # gem
2+
require 'zlib' # stdlib
3+
require 'digest' # stdlib
74

85
class BloomFilter
6+
# return an array of bit indices ("on bits") via repeated string hashing
7+
# start with the fastest/cheapest algos, up to 8 rounds
8+
# beyond that, perform cyclic "hashing" with CRC32
9+
def self.hash_bits(str, num_hashes:, num_bits:)
10+
val = 0 # for cyclic hashing
11+
Array.new(num_hashes) { |i|
12+
case i
13+
when 0 then str.hash
14+
when 1 then Zlib.crc32(str)
15+
when 2 then Digest::MD5.hexdigest(str).to_i(16)
16+
when 3 then Digest::SHA1.hexdigest(str).to_i(16)
17+
when 4 then Digest::SHA256.hexdigest(str).to_i(16)
18+
when 5 then Digest::SHA384.hexdigest(str).to_i(16)
19+
when 6 then Digest::SHA512.hexdigest(str).to_i(16)
20+
when 7 then Digest::RMD160.hexdigest(str).to_i(16)
21+
else # cyclic hashing with CRC32
22+
val = Zlib.crc32(str, val)
23+
end % num_bits
24+
}
25+
end
26+
27+
attr_reader :bitmap
928

10-
def initialize(max_entries, num_hashes, seed)
29+
# The default values require 8 kilobytes of storage and recognize:
30+
# < 4000 strings: FPR 0.1%
31+
# < 7000 strings: FPR 1%
32+
# > 10k strings: FPR 5%
33+
# The false positive rate goes up as more strings are added
34+
def initialize(num_bits: 2**16, num_hashes: 5)
35+
@num_bits = num_bits
1136
@num_hashes = num_hashes
12-
@size = max_entries.to_i
13-
@bitmap = Bitset.new(@size)
14-
@__mask = Bitset.new(@size)
15-
@seed = seed
37+
@bitmap = Bitset.new(@num_bits)
1638
end
1739

18-
def insert(key)
19-
mask = make_mask(key)
20-
@bitmap |= mask
40+
def hash_bits(str)
41+
self.class.hash_bits(str, num_hashes: @num_hashes, num_bits: @num_bits)
2142
end
2243

23-
def new?(key)
24-
mask = make_mask(key)
25-
return ((@bitmap & mask) != mask);
44+
def add(str)
45+
@bitmap.set *self.hash_bits(str)
2646
end
47+
alias_method(:<<, :add)
2748

28-
def make_mask(key)
29-
@__mask.clear
30-
0.upto(@num_hashes.to_i - 1) do |i|
31-
hash = Zlib.crc32(key, i + @seed)
32-
@__mask.set(hash % @size, 1)
33-
end
34-
return @__mask
49+
def include?(str)
50+
@bitmap.set? *self.hash_bits(str)
51+
end
52+
53+
def likelihood(str)
54+
self.include?(str) ? 1.0 - self.fpr : 0
55+
end
56+
alias_method(:[], :likelihood)
57+
58+
def percent_full
59+
@bitmap.to_a.count.to_f / @num_bits
60+
end
61+
62+
def fpr
63+
self.percent_full**@num_hashes
3564
end
65+
66+
def to_s
67+
format("%i bits (%.1f kB, %i hashes) %i%% full; FPR: %.3f%%",
68+
@num_bits, @num_bits.to_f / 2**13, @num_hashes,
69+
self.percent_full * 100, self.fpr * 100)
70+
end
71+
alias_method(:inspect, :to_s)
3672
end
3773

38-
def main
39-
bf = BloomFilter.new(1000000, 4, 0)
74+
if __FILE__ == $0
75+
puts "Enter strings into the filter; empty line to display filter status"
76+
puts "Two empty lines to quit"
77+
puts
78+
79+
bf = BloomFilter.new(num_bits: 512, num_hashes: 5)
4080
num = 0
41-
while line = ARGF.gets
42-
data = line.chop
81+
last = ''
4382

44-
if bf.new?(data)
83+
# ingest loop
84+
while str = $stdin.gets&.chomp
85+
if str.empty?
86+
puts bf
87+
break if last.empty?
88+
else
89+
bf << str
4590
num += 1
46-
bf.insert(data)
4791
end
92+
last = str
93+
end
94+
95+
puts "ingested #{num} strings"
96+
puts "test if the filter recognizes strings below:"
97+
puts
98+
99+
# test loop
100+
last = ''
101+
while str = $stdin.gets&.chomp
102+
if str.empty?
103+
puts bf
104+
break if last.empty?
105+
else
106+
puts format("%.1f%%\t%s", bf[str] * 100, str)
107+
end
108+
last = str
48109
end
49-
print "#element = #{num}\n"
50110
end
51111

52-
main
112+
113+
# Everything below this line is to enable using this source file as input:
114+
# cat examples/pure-ruby-bf.rb | ruby examples/pure-ruby-bf.rb
115+
# the two empty lines above should break the ingest loop
116+
# and now we can put stuff in the test loop:
117+
if false
118+
# nothing in here will execute, but check if we've seen these lines before
119+
# 1. puts (yes)
120+
# 2. ingest loop comment (yes)
121+
# 3. test loop comment (yes)
122+
# 4. end (yes)
123+
puts
124+
# ingest loop
125+
# test loop
126+
end

0 commit comments

Comments
 (0)