@@ -170,6 +170,7 @@ static constexpr std::size_t kFeaturesFlagLength{sizeof(uint8_t)};
170170static constexpr std::size_t kGolombParamSizeLength {sizeof (uint32_t )}; // Erigon writes 4-instead-of-2 bytes
171171static constexpr std::size_t kEliasFano32CountLength {sizeof (uint64_t )};
172172static constexpr std::size_t kEliasFano32ULength {sizeof (uint64_t )};
173+ static constexpr std::size_t kExistenceFilterSizeLength {sizeof (uint64_t )};
173174
174175// ! Size in bytes of 2nd fixed metadata header in RecSplit-encoded file
175176constexpr std::size_t kSecondMetadataHeaderLength {
@@ -181,7 +182,8 @@ struct RecSplitSettings {
181182 uint16_t bucket_size; // The number of keys in each bucket (except probably last one)
182183 std::filesystem::path index_path; // The path of the generated RecSplit index file
183184 uint64_t base_data_id; // Application-specific base data ID written in index header
184- bool double_enum_index{true }; // Flag indicating if 2-level index is required
185+ bool double_enum_index{true }; // Flag indicating if 2-layer index is required
186+ bool less_false_positives{false }; // Flag indicating if existence filter to reduce false-positives is required
185187};
186188
187189template <typename T>
@@ -210,14 +212,15 @@ constexpr auto operator~(const T t) {
210212}
211213
212214enum class RecSplitFeatures : uint8_t {
213- kNone = 0b0 , // no specific feature
214- kEnums = 0b1 , // 2-level index with PHT pointing to enumeration and enumeration pointing to offsets
215+ kNone = 0b0 , // no specific feature
216+ kEnums = 0b1 , // 2-layer index with PHT pointing to enumeration and enumeration pointing to offsets
217+ kLessFalsePositives = 0b10 , // reduce false-positives to 1/256=0.4% at the cost of 1byte per key
215218};
216219consteval void enable_bitmask_operator_and (RecSplitFeatures);
217220consteval void enable_bitmask_operator_or (RecSplitFeatures);
218221consteval void enable_bitmask_operator_not (RecSplitFeatures);
219222
220- constexpr std::array kSupportedFeatures {RecSplitFeatures::kEnums };
223+ constexpr std::array kSupportedFeatures {RecSplitFeatures::kEnums , RecSplitFeatures:: kLessFalsePositives };
221224
222225// ! Recursive splitting (RecSplit) is an efficient algorithm to identify minimal perfect hash functions.
223226// ! The template parameter LEAF_SIZE decides how large a leaf will be. Larger leaves imply slower construction, but less
@@ -319,9 +322,10 @@ class RecSplit {
319322 const RecSplitFeatures features{(address + offset)[0 ]};
320323 check_supported_features (features);
321324 double_enum_index_ = (features & RecSplitFeatures::kEnums ) != RecSplitFeatures::kNone ;
325+ less_false_positives_ = (features & RecSplitFeatures::kLessFalsePositives ) != RecSplitFeatures::kNone ;
322326 offset += kFeaturesFlagLength ;
323327
324- if (double_enum_index_) {
328+ if (double_enum_index_ && key_count_ > 0 ) {
325329 check_minimum_length (offset + kEliasFano32CountLength + kEliasFano32ULength );
326330
327331 // Read Elias-Fano index for offsets
@@ -332,6 +336,21 @@ class RecSplit {
332336 std::span<uint8_t > remaining_data{address + offset, encoded_file_->length () - offset};
333337 ef_offsets_ = std::make_unique<EliasFano>(count, u, remaining_data);
334338 offset += ef_offsets_->data ().size () * sizeof (uint64_t );
339+
340+ if (less_false_positives_) {
341+ // Read 1-byte-per-key existence filter used to reduce false positives
342+ const uint64_t filter_size = endian::load_big_u64 (address + offset);
343+ offset += kExistenceFilterSizeLength ;
344+ if (filter_size != key_count_) {
345+ throw std::runtime_error{
346+ " Incompatible index format: existence filter length " + std::to_string (filter_size) +
347+ " != key count " + std::to_string (key_count_)};
348+ }
349+ std::span<uint8_t > filter_data{address + offset, filter_size};
350+ existence_filter_.resize (filter_size);
351+ std::copy (filter_data.begin (), filter_data.end (), existence_filter_.data ());
352+ offset += filter_size;
353+ }
335354 }
336355
337356 // Read the number of Golomb-Rice code params
@@ -496,11 +515,19 @@ class RecSplit {
496515 hasher_->reset_seed (salt_);
497516 }
498517
499- /* * Return the value associated with the given 128-bit hash.
500- * Note that this method is mainly useful for benchmarking.
501- * @param hash a 128-bit hash.
502- * @return the associated value.
503- */
518+ // ! Check if the low byte of the given bucket hash matches the i-th entry of the 1-byte-per-key existence filter
519+ // ! \return false only on a filter mismatch; true on a match, or when the filter is disabled or i is beyond its size
520+ [[nodiscard]] bool has (const hash128_t & hash, std::size_t i) const {
521+ if (less_false_positives_ && i < existence_filter_.size ()) {
522+ return existence_filter_[i] == static_cast <uint8_t >(hash.first );  // i already range-checked above
523+ }
524+ // If existence filter not applicable, default is true: MPHF has no presence indicator
525+ return true ;
526+ }
527+
528+ // ! Return the value associated with the given 128-bit bucket hash
529+ // ! \param hash a 128-bit bucket hash
530+ // ! \return the associated value
504531 std::size_t operator ()(const hash128_t & hash) const {
505532 ensure (built_, " RecSplit: perfect hash function not built yet" );
506533 ensure (key_count_ > 0 , " RecSplit: invalid lookup with zero keys, use empty() to guard" );
@@ -592,6 +619,12 @@ class RecSplit {
592619 [[nodiscard]] uint64_t bucket_count () const { return bucket_count_; }
593620 [[nodiscard]] uint16_t bucket_size () const { return bucket_size_; }
594621
622+ [[nodiscard]] bool double_enum_index () const { return double_enum_index_; }  // whether the two-level (recsplit -> enum, enum -> offset) index is enabled
623+ [[nodiscard]] bool less_false_positives () const { return less_false_positives_; }  // whether the less false-positives feature (existence filter) is enabled
624+
625+ // ! Return the existence filter by const reference (no copy; valid while this index is alive). It can be empty if less false-positives feature is not enabled
626+ [[nodiscard]] const std::vector<uint8_t >& existence_filter () const { return existence_filter_; }
627+
595628 [[nodiscard]] std::size_t file_size () const { return std::filesystem::file_size (index_path_); }
596629
597630 [[nodiscard]] std::filesystem::file_time_type last_write_time () const {
@@ -617,10 +650,9 @@ class RecSplit {
617650 return golomb_param (m, memo);
618651 }
619652
620- // Generates the precomputed table of 32-bit values holding the Golomb-Rice code
621- // of a splitting (upper 5 bits), the number of nodes in the associated subtree
622- // (following 11 bits) and the sum of the Golomb-Rice code lengths in the same
623- // subtree (lower 16 bits).
653+ // ! Generate the precomputed table of 32-bit values holding the Golomb-Rice code of a splitting (upper 5 bits),
654+ // ! the number of nodes in the associated subtree (following 11 bits) and the sum of the Golomb-Rice code lengths
655+ // ! in the same subtree (lower 16 bits)
624656 static constexpr void precompute_golomb_rice (const int m, std::array<uint32_t , kMaxBucketSize >* memo) {
625657 std::array<std::size_t , kMaxFanout > k{0 };
626658
@@ -638,7 +670,8 @@ class RecSplit {
638670 }
639671
640672 const double p = sqrt (m) / (pow (2 * std::numbers::pi, (static_cast <double >(fanout) - 1 .) * 0.5 ) * sqrt_prod);
641- auto golomb_rice_length = math::int_ceil<uint32_t >(log2 (-std::log ((sqrt (5 ) + 1 ) * 0.5 ) / log1p (-p))); // log2 Golomb modulus
673+ std::integral auto golomb_rice_length =
674+ math::int_ceil<uint32_t >(log2 (-std::log ((sqrt (5 ) + 1 ) * 0.5 ) / log1p (-p))); // log2 Golomb modulus
642675
643676 SILKWORM_ASSERT (golomb_rice_length <= 0x1F ); // Golomb-Rice code, stored in the 5 upper bits
644677 (*memo)[m] = golomb_rice_length << 27 ;
@@ -669,28 +702,28 @@ class RecSplit {
669702 }
670703
671704 // ! Apply the RecSplit algorithm to the given bucket
672- template <typename GRBUILDER >
705+ template <typename GRBuilder >
673706 static void recsplit (std::vector<uint64_t >& keys,
674707 std::vector<uint64_t >& offsets,
675708 std::vector<uint64_t >& buffer_keys, // temporary buffer for keys
676709 std::vector<uint64_t >& buffer_offsets, // temporary buffer for offsets
677- GRBUILDER & gr_builder,
710+ GRBuilder & gr_builder,
678711 std::ostream& index_ofs,
679712 uint16_t & golomb_param_max_index,
680713 uint8_t bytes_per_record) {
681714 recsplit (/* .level=*/ 0 , keys, offsets, buffer_keys, buffer_offsets, /* .start=*/ 0 , /* .end=*/ keys.size (),
682715 gr_builder, index_ofs, golomb_param_max_index, bytes_per_record);
683716 }
684717
685- template <typename GRBUILDER >
718+ template <typename GRBuilder >
686719 static void recsplit (int level, // NOLINT
687720 std::vector<uint64_t >& keys,
688721 std::vector<uint64_t >& offsets, // aka values
689722 std::vector<uint64_t >& buffer_keys, // temporary buffer for keys
690723 std::vector<uint64_t >& buffer_offsets, // temporary buffer for offsets
691724 std::size_t start,
692725 std::size_t end,
693- GRBUILDER & gr_builder,
726+ GRBuilder & gr_builder,
694727 std::ostream& index_ofs,
695728 uint16_t & golomb_param_max_index,
696729 uint8_t bytes_per_record) {
@@ -795,7 +828,7 @@ class RecSplit {
795828 return h;
796829 }
797830
798- // Maps a 128-bit to a bucket using the first 64-bit half.
831+ // ! Maps a 128-bit hash to a bucket using the first 64-bit half (remapped over bucket_count_ via remap128)
799832 [[nodiscard]] inline uint64_t hash128_to_bucket (const hash128_t & hash) const { return remap128 (hash.first , bucket_count_); }
800833
801834 void check_minimum_length (std::size_t minimum_length) {
@@ -867,7 +900,7 @@ class RecSplit {
867900 // ! Helper to encode the sequences of key offsets in the single EF code
868901 std::unique_ptr<EliasFano> ef_offsets_;
869902
870- // ! Minimal app-specific ID of entries of this index - helps app understand what data stored in given shard - persistent field
903+ // ! Minimal app-specific ID of entries in this index - helps to understand what data is stored in a given shard - persistent field
871904 uint64_t base_data_id_;
872905
873906 // ! The path of the index file generated
@@ -879,9 +912,15 @@ class RecSplit {
879912 // ! The bitmask to be used to interpret record data
880913 uint64_t record_mask_{0 };
881914
882- // ! Flag indicating if two-level index "recsplit -> enum" + "enum -> offset" is required
915+ // ! Flag indicating if two-level index "recsplit -> enum" + "enum -> offset" is enabled or not
883916 bool double_enum_index_{true };
884917
918+ // ! Flag indicating if less false-positives feature is enabled or not
919+ bool less_false_positives_{false };
920+
921+ // ! The 1-byte per key positional existence filter used to have less false-positives
922+ std::vector<uint8_t > existence_filter_;
923+
885924 // ! Flag indicating that the MPHF has been built and no more keys can be added
886925 bool built_{false };
887926
0 commit comments