From 287646650a3db4da2fc4d8db3ad53aca05ad22a4 Mon Sep 17 00:00:00 2001 From: Rick Hull Date: Sat, 2 Mar 2024 01:46:23 -0500 Subject: [PATCH 1/4] stop using Digest::MD5 and friends - just use CRC32 - it's fast and works very well for purpose --- examples/pure-ruby-bf.rb | 67 ++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/examples/pure-ruby-bf.rb b/examples/pure-ruby-bf.rb index 61eff6c..bacf572 100644 --- a/examples/pure-ruby-bf.rb +++ b/examples/pure-ruby-bf.rb @@ -1,71 +1,56 @@ -require 'bitset' # gem require 'zlib' # stdlib -require 'digest' # stdlib +require 'bitset' # gem class BloomFilter - # return an array of bit indices ("on bits") via repeated string hashing - # start with the fastest/cheapest algos, up to 8 rounds - # beyond that, perform cyclic "hashing" with CRC32 - def self.hash_bits(str, num_hashes:, num_bits:) - val = 0 # for cyclic hashing - Array.new(num_hashes) { |i| - case i - when 0 then str.hash - when 1 then Zlib.crc32(str) - when 2 then Digest::MD5.hexdigest(str).to_i(16) - when 3 then Digest::SHA1.hexdigest(str).to_i(16) - when 4 then Digest::SHA256.hexdigest(str).to_i(16) - when 5 then Digest::SHA384.hexdigest(str).to_i(16) - when 6 then Digest::SHA512.hexdigest(str).to_i(16) - when 7 then Digest::RMD160.hexdigest(str).to_i(16) - else # cyclic hashing with CRC32 - val = Zlib.crc32(str, val) - end % num_bits - } - end + MAX_BITS = 2**32 # CRC32 yields 32-bit values - attr_reader :bitmap + attr_reader :bitsize, :aspects, :bitmap # The default values require 8 kilobytes of storage and recognize: - # < 4000 strings: FPR 0.1% - # < 7000 strings: FPR 1% - # > 10k strings: FPR 5% - # The false positive rate goes up as more strings are added - def initialize(num_bits: 2**16, num_hashes: 5) - @num_bits = num_bits - @num_hashes = num_hashes - @bitmap = Bitset.new(@num_bits) + # < 7000 strings at 1% False Positive Rate (4k @ 0.1%) (10k @ 5%) + # FPR goes up as more strings are added + def 
initialize(bitsize: 2**16, aspects: 5) + @bitsize = bitsize + raise("bitsize: #{@bitsize}") if @bitsize > MAX_BITS + @aspects = aspects + @bitmap = Bitset.new(@bitsize) end - def hash_bits(str) - self.class.hash_bits(str, num_hashes: @num_hashes, num_bits: @num_bits) + # Return an array of bit indices ("on bits") corresponding to + # multiple rounds of string hashing (CRC32 is fast and ~fine~) + def bits(str) + val = 0 + Array.new(@aspects) { (val = Zlib.crc32(str, val)) % @bitsize } end def add(str) - @bitmap.set *self.hash_bits(str) + @bitmap.set(*self.bits(str)) end alias_method(:<<, :add) + # true or false; a `true` result may be a "false positive" def include?(str) - @bitmap.set? *self.hash_bits(str) + @bitmap.set?(*self.bits(str)) end + # returns either 0 or a number like 0.95036573 def likelihood(str) self.include?(str) ? 1.0 - self.fpr : 0 end alias_method(:[], :likelihood) + # relatively expensive; don't test against this in a loop def percent_full - @bitmap.to_a.count.to_f / @num_bits + @bitmap.to_a.count.to_f / @bitsize end def fpr - self.percent_full**@num_hashes + self.percent_full**@aspects end def to_s - format("%i bits (%.1f kB, %i hashes) %i%% full; FPR: %.3f%%", - @num_bits, @num_bits.to_f / 2**13, @num_hashes, + format("%i bits (%.1f kB, %i aspects) %i%% full; FPR: %.3f%%", + @bitsize, @bitsize.to_f / 2**13, @aspects, self.percent_full * 100, self.fpr * 100) end alias_method(:inspect, :to_s) @@ -76,7 +61,7 @@ def to_s puts "Two empty lines to quit" puts - bf = BloomFilter.new(num_bits: 512, num_hashes: 5) + bf = BloomFilter.new(bitsize: 512, aspects: 5) num = 0 last = '' @@ -103,7 +88,7 @@ def to_s puts bf break if last.empty? 
else - puts format("%.1f%%\t%s", bf[str] * 100, str) + puts format("%04.1f%% %s \t %s", bf[str] * 100, str, bf.bits(str)) end last = str end From 91172ec011890c2dbd65f61dbcaa3d38ae9f0752 Mon Sep 17 00:00:00 2001 From: Rick Hull Date: Fri, 22 Mar 2024 11:34:54 -0400 Subject: [PATCH 2/4] make this really a pure ruby bloom filter - remove bitset gem, which is a C extension - bitset gem does not currently build on ARM - add pure ruby BitSet implementation - roughly half the performance of the bitset gem in extensive testing - plenty fast for practical use - brought both BitSet and BloomFilter classes up to date with compsci gem - renamed some ivars and methods --- examples/pure-ruby-bf.rb | 77 ++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/examples/pure-ruby-bf.rb b/examples/pure-ruby-bf.rb index bacf572..2856318 100644 --- a/examples/pure-ruby-bf.rb +++ b/examples/pure-ruby-bf.rb @@ -1,36 +1,83 @@ -require 'zlib' # stdlib -require 'bitset' # gem +# stdlib +require 'rbconfig/sizeof' +require 'zlib' + +class BitSet + # in bits, e.g. 64 bit / 32 bit platforms. SIZEOF returns byte width + INT_WIDTH = RbConfig::SIZEOF.fetch('long') * 8 + + # return an array of ones and zeroes, padded to INT_WIDTH + def self.bits(int) + bit_ary = int.digits(2) + bit_ary + Array.new(INT_WIDTH - bit_ary.count, 0) + end + + attr_reader :storage + + # create an array of integers, default 0 + # use flip_even_bits to initialize with every even bit set to 1 + def initialize(num_bits, flip_even_bits: false) + init = flip_even_bits ? (2**INT_WIDTH / 3r).to_i : 0 + @storage = Array.new((num_bits / INT_WIDTH.to_f).ceil, init) + end + + # ensure the given bit_indices are set to 1 + def set(bit_indices) + bit_indices.each { |b| + slot, val = b.divmod(INT_WIDTH) + @storage[slot] |= (1 << val) + } + end + + # determine if all given bit indices are set to 1 + def set?(bit_indices) + bit_indices.all? 
{ |b| + slot, val = b.divmod(INT_WIDTH) + @storage[slot][val] != 0 + } + end + + # returns an array of ones and zeroes, padded to INT_WIDTH + def bits + @storage.flat_map { |i| self.class.bits(i) } + end + + # returns an array of bit indices + def on_bits + self.bits.filter_map.with_index { |b, i| i if b == 1 } + end +end class BloomFilter MAX_BITS = 2**32 # CRC32 yields 32-bit values - attr_reader :bitsize, :aspects, :bitmap + attr_reader :bits, :aspects, :bitmap # The default values require 8 kilobytes of storage and recognize: # < 7000 strings at 1% False Positive Rate (4k @ 0.1%) (10k @ 5%) # FPR goes up as more strings are added - def initialize(bitsize: 2**16, aspects: 5) - @bitsize = bitsize - raise("bitsize: #{@bitsize}") if @bitsize > MAX_BITS + def initialize(bits: 2**16, aspects: 5) + @bits = bits + raise("bits: #{@bits}") if @bits > MAX_BITS @aspects = aspects - @bitmap = Bitset.new(@bitsize) + @bitmap = BitSet.new(@bits) end # Return an array of bit indices ("on bits") corresponding to # multiple rounds of string hashing (CRC32 is fast and ~fine~) - def bits(str) + def index(str) val = 0 - Array.new(@aspects) { (val = Zlib.crc32(str, val)) % @bitsize } + Array.new(@aspects) { (val = Zlib.crc32(str, val)) % @bits } end def add(str) - @bitmap.set(*self.bits(str)) + @bitmap.set(self.index(str)) end alias_method(:<<, :add) # true or false; a `true` result may be a "false positive" def include?(str) - @bitmap.set?(*self.bits(str)) + @bitmap.set?(self.index(str)) end # returns either 0 or a number like 0.95036573 @@ -41,7 +88,7 @@ def likelihood(str) # relatively expensive; don't test against this in a loop def percent_full - @bitmap.to_a.count.to_f / @bitsize + @bitmap.on_bits.count.to_f / @bits end def fpr @@ -50,7 +97,7 @@ def fpr def to_s format("%i bits (%.1f kB, %i aspects) %i%% full; FPR: %.3f%%", - @bitsize, @bitsize.to_f / 2**13, @aspects, + @bits, @bits.to_f / 2**13, @aspects, self.percent_full * 100, self.fpr * 100) end alias_method(:inspect, 
:to_s) @@ -61,7 +108,7 @@ def to_s puts "Two empty lines to quit" puts - bf = BloomFilter.new(bitsize: 512, aspects: 5) + bf = BloomFilter.new(bits: 512, aspects: 5) num = 0 last = '' @@ -88,7 +135,7 @@ def to_s puts bf break if last.empty? else - puts format("%04.1f%% %s \t %s", bf[str] * 100, str, bf.bits(str)) + puts format("%04.1f%% %s \t %s", bf[str] * 100, str, bf.index(str)) end last = str end From d5c0040f83db56f251714cc84e8ac321c4198c81 Mon Sep 17 00:00:00 2001 From: Rick Hull Date: Fri, 22 Mar 2024 17:42:16 -0400 Subject: [PATCH 3/4] add and improve comments --- examples/pure-ruby-bf.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/pure-ruby-bf.rb b/examples/pure-ruby-bf.rb index 2856318..228a98b 100644 --- a/examples/pure-ruby-bf.rb +++ b/examples/pure-ruby-bf.rb @@ -114,10 +114,10 @@ def to_s # ingest loop while str = $stdin.gets&.chomp - if str.empty? + if str.empty? # display status; end the loop on consecutive empty lines puts bf break if last.empty? - else + else # ingest the line; update the count bf << str num += 1 end @@ -131,10 +131,10 @@ def to_s # test loop last = '' while str = $stdin.gets&.chomp - if str.empty? + if str.empty? # as before puts bf break if last.empty? - else + else # show the likelihood for each item and its index puts format("%04.1f%% %s \t %s", bf[str] * 100, str, bf.index(str)) end last = str @@ -148,10 +148,10 @@ def to_s # and now we can put stuff in the test loop: if false # nothing in here will execute, but check if we've seen these lines before - # 1. puts (yes) + # 1. puts (yes) # 2. ingest loop comment (yes) - # 3. test loop comment (yes) - # 4. end (yes) + # 3. test loop comment (yes) + # 4. 
end (yes) puts # ingest loop # test loop From 474daa102eb84d88becf2f262450414309296ec3 Mon Sep 17 00:00:00 2001 From: Rick Hull Date: Tue, 26 Mar 2024 18:21:11 -0400 Subject: [PATCH 4/4] unify interface - use #add(singular) and #include?(singular) - remove #set(plural) and #set?(plural) --- examples/pure-ruby-bf.rb | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/examples/pure-ruby-bf.rb b/examples/pure-ruby-bf.rb index 228a98b..e21fa5d 100644 --- a/examples/pure-ruby-bf.rb +++ b/examples/pure-ruby-bf.rb @@ -5,6 +5,7 @@ class BitSet # in bits, e.g. 64 bit / 32 bit platforms. SIZEOF returns byte width INT_WIDTH = RbConfig::SIZEOF.fetch('long') * 8 + EVEN_BYTE = (2**INT_WIDTH / 3r).to_i # return an array of ones and zeroes, padded to INT_WIDTH def self.bits(int) @@ -17,32 +18,28 @@ def self.bits(int) # create an array of integers, default 0 # use flip_even_bits to initialize with every even bit set to 1 def initialize(num_bits, flip_even_bits: false) - init = flip_even_bits ? (2**INT_WIDTH / 3r).to_i : 0 - @storage = Array.new((num_bits / INT_WIDTH.to_f).ceil, init) + @storage = Array.new((num_bits / INT_WIDTH.to_f).ceil, + flip_even_bits ? EVEN_BYTE : 0) end - # ensure the given bit_indices are set to 1 - def set(bit_indices) - bit_indices.each { |b| - slot, val = b.divmod(INT_WIDTH) - @storage[slot] |= (1 << val) - } + # set the bit_index to 1 + def add(bit_index) + slot, val = bit_index.divmod(INT_WIDTH) + @storage[slot] |= (1 << val) end - # determine if all given bit indices are set to 1 - def set?(bit_indices) - bit_indices.all? { |b| - slot, val = b.divmod(INT_WIDTH) - @storage[slot][val] != 0 - } + # is the bit_index set to 1?
+ def include?(bit_index) + slot, val = bit_index.divmod(INT_WIDTH) + @storage[slot][val] != 0 end - # returns an array of ones and zeroes, padded to INT_WIDTH + # return an array of ones and zeroes, padded to INT_WIDTH def bits @storage.flat_map { |i| self.class.bits(i) } end - # returns an array of bit indices + # return an array of bit indices def on_bits self.bits.filter_map.with_index { |b, i| i if b == 1 } end @@ -71,13 +68,13 @@ def index(str) end def add(str) - @bitmap.set(self.index(str)) + self.index(str).each { |i| @bitmap.add(i) } end alias_method(:<<, :add) # true or false; a `true` result may be a "false positive" def include?(str) - @bitmap.set?(self.index(str)) + self.index(str).all? { |i| @bitmap.include?(i) } end # returns either 0 or a number like 0.95036573