|
1 |
| -# |
2 |
| -# Pure ruby implementation of a Bloom filter, just for kicks |
3 |
| -# |
4 |
| - |
5 |
| -require 'bitset' |
6 |
| -require 'zlib' |
| 1 | +require 'bitset' # gem |
| 2 | +require 'zlib' # stdlib |
| 3 | +require 'digest' # stdlib |
7 | 4 |
|
# A Bloom filter over strings: a fixed-size bit array plus several hash
# functions. Adding a string turns on one bit per hash function; a string is
# reported as present only when all of its bits are on. Lookups can yield
# false positives but never false negatives.
class BloomFilter
  # Return an array of bit indices ("on bits") via repeated string hashing.
  #
  # Rounds 0-7 use a different algorithm each, cheapest first. Beyond that,
  # CRC32 is applied cyclically: each round feeds the previous CRC32 result
  # back in as the starting checksum.
  #
  # The cyclic chain continues from the CRC32 computed in round 1 instead of
  # starting again from 0. Zlib.crc32(str, 0) is the same as Zlib.crc32(str),
  # so a chain seeded with 0 would make round 8 an exact repeat of round 1.
  #
  # NOTE: round 0 uses String#hash, which Ruby seeds per process, so the
  # indices for a given string are only stable within a single run.
  #
  # @param str [String] the string to hash
  # @param num_hashes [Integer] how many indices to produce
  # @param num_bits [Integer] size of the bit array; every index is in
  #   0...num_bits
  # @return [Array<Integer>] num_hashes bit indices (duplicates are possible)
  def self.hash_bits(str, num_hashes:, num_bits:)
    crc = 0 # running CRC32, set in round 1 and extended by the cyclic rounds
    Array.new(num_hashes) { |i|
      raw = case i
            when 0 then str.hash
            when 1 then crc = Zlib.crc32(str)
            when 2 then Digest::MD5.hexdigest(str).to_i(16)
            when 3 then Digest::SHA1.hexdigest(str).to_i(16)
            when 4 then Digest::SHA256.hexdigest(str).to_i(16)
            when 5 then Digest::SHA384.hexdigest(str).to_i(16)
            when 6 then Digest::SHA512.hexdigest(str).to_i(16)
            when 7 then Digest::RMD160.hexdigest(str).to_i(16)
            else # cyclic hashing with CRC32
              crc = Zlib.crc32(str, crc)
            end
      # Integer#% with a positive modulus is never negative, so a negative
      # String#hash still lands inside the bit array
      raw % num_bits
    }
  end

  attr_reader :bitmap

  # The default values require 8 kilobytes of storage and recognize:
  # < 4000 strings: FPR 0.1%
  # < 7000 strings: FPR 1%
  # > 10k strings: FPR 5%
  # The false positive rate goes up as more strings are added
  #
  # @param num_bits [Integer] size of the bit array
  # @param num_hashes [Integer] number of hash rounds applied per string
  def initialize(num_bits: 2**16, num_hashes: 5)
    @num_bits = num_bits
    @num_hashes = num_hashes
    @bitmap = Bitset.new(@num_bits)
  end

  # Bit indices for +str+ using this filter's own parameters.
  def hash_bits(str)
    self.class.hash_bits(str, num_hashes: @num_hashes, num_bits: @num_bits)
  end

  # Record +str+ in the filter by turning on each of its bits.
  # Returns whatever Bitset#set returns.
  def add(str)
    @bitmap.set(*hash_bits(str))
  end
  alias_method(:<<, :add)

  # Test whether +str+ may have been added; passes every bit index of +str+
  # to Bitset#set? at once.
  def include?(str)
    @bitmap.set?(*hash_bits(str))
  end

  # Estimated probability that +str+ was really added: 0.0 when the filter
  # does not recognize it, otherwise 1.0 minus the current false positive
  # rate. Always a Float.
  def likelihood(str)
    include?(str) ? 1.0 - fpr : 0.0
  end
  alias_method(:[], :likelihood)

  # Fraction of bits currently on. Despite the name this is a ratio, not a
  # percentage; callers multiply by 100 for display. Relies on Bitset#to_a
  # listing one entry per set bit.
  def percent_full
    @bitmap.to_a.count.to_f / @num_bits
  end

  # Estimated false positive rate: the fraction of bits that are on, raised
  # to the power of the number of hashes.
  def fpr
    percent_full**@num_hashes
  end

  # One-line summary: size in bits and kB (2**13 bits per kB), hash count,
  # fill level and estimated false positive rate.
  def to_s
    format("%i bits (%.1f kB, %i hashes) %i%% full; FPR: %.3f%%",
           @num_bits, @num_bits.to_f / 2**13, @num_hashes,
           percent_full * 100, fpr * 100)
  end
  alias_method(:inspect, :to_s)
end
|
37 | 73 |
|
38 |
| -def main |
39 |
| - bf = BloomFilter.new(1000000, 4, 0) |
if $PROGRAM_NAME == __FILE__
  # Interactive demo, run only when this file is executed directly.
  # Phase 1 reads strings from stdin into a small filter; phase 2 reads more
  # strings and reports how likely each is to have been seen. In both phases
  # an empty line prints the filter status, and an empty line that follows
  # another empty line (or opens the phase) ends the phase.
  puts "Enter strings into the filter; empty line to display filter status"
  puts "Two empty lines to quit"
  puts

  filter = BloomFilter.new(num_bits: 512, num_hashes: 5)
  ingested = 0
  previous = ''

  # ingest loop
  $stdin.each_line do |raw|
    line = raw.chomp
    if !line.empty?
      filter << line
      ingested += 1
    else
      puts filter
      break if previous.empty?
    end
    previous = line
  end

  puts "ingested #{ingested} strings"
  puts "test if the filter recognizes strings below:"
  puts

  # test loop
  previous = ''
  $stdin.each_line do |raw|
    line = raw.chomp
    if !line.empty?
      puts format("%.1f%%\t%s", filter[line] * 100, line)
    else
      puts filter
      break if previous.empty?
    end
    previous = line
  end
end
|
51 | 111 |
|
52 |
| -main |
| 112 | + |
# Everything below this line exists so this source file can be its own input:
#   cat examples/pure-ruby-bf.rb | ruby examples/pure-ruby-bf.rb
# the two newlines above should break the ingest loop,
# which leaves the remaining lines to be read by the test loop:
if false
  # This branch never runs. Its lines are probes for the test loop, each one
  # repeating a line that was already ingested from earlier in the file:
  # 1. puts (yes)
  # 2. ingest loop comment (yes)
  # 3. test loop comment (yes)
  # 4. end (yes)
  puts
  # ingest loop
  # test loop
end
0 commit comments