diff --git a/utils/zerotrie/src/builder/dense.rs b/utils/zerotrie/src/builder/dense.rs
index fb5128b957c..40c8f994f00 100644
--- a/utils/zerotrie/src/builder/dense.rs
+++ b/utils/zerotrie/src/builder/dense.rs
@@ -28,6 +28,40 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
 }
 
 impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
+    /// Finds the row offset that fits the most values into a single dense row.
+    ///
+    /// Used when the values span a range too wide for one row. A row anchored at
+    /// offset `o` covers the values `v` with `o <= v < o + DenseType::MAX`. The result
+    /// is the lowest value of the window that covers the most entries of `values`;
+    /// on a tie the lowest such window wins. Returns 0 if `values` is empty.
+    ///
+    /// NOTE(review): every value is counted, including values whose suffix may not be
+    /// eligible for the dense matrix; confirm that this is the intended heuristic.
+    fn find_window(values: &BTreeMap<&'a str, usize>) -> usize {
+        let row_width = usize::from(DenseType::MAX);
+        let mut sorted_vals: Vec<usize> = values.values().copied().collect();
+        sorted_vals.sort_unstable();
+        let mut bot = 0;
+        let mut best_len = 0;
+        let mut best_index = 0;
+        for (top, &top_val) in sorted_vals.iter().enumerate() {
+            // Drop values from the bottom until the rest share a row with `top_val`.
+            // `bot` cannot pass `top`: a value always shares a row with itself.
+            while let Some(&bot_val) = sorted_vals.get(bot) {
+                if top_val - bot_val < row_width {
+                    break;
+                }
+                bot += 1;
+            }
+            let len = top - bot + 1;
+            if len > best_len {
+                best_len = len;
+                best_index = bot;
+            }
+        }
+        sorted_vals.get(best_index).copied().unwrap_or(0)
+    }
+
     /// Add a prefix and all values associated with the prefix to the builder.
     pub(crate) fn add_prefix(
         &mut self,
@@ -42,20 +76,22 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
             max = core::cmp::max(max, *value);
         }
         // >= because DenseType::MAX is the sentinel
-        if max - min >= usize::from(DenseType::MAX) {
-            // TODO(#7303): How to implement this when we need it:
-            // 1. Select the row offset that gets the greatest number of values into the dense matrix.
-            // 2. Put all out-of-range values into the primary trie, as we do with sparse rows.
-            todo!("values in row are not in a sufficiently compact range");
-        }
-        let sentinel = min + usize::from(DenseType::MAX);
+        let row_value_offset = if max - min >= usize::from(DenseType::MAX) {
+            // The values do not fit into one dense row: anchor the row at the offset
+            // that covers the most values. Values outside of that window are kept out
+            // of the dense candidates by the partition below.
+            Self::find_window(values)
+        } else {
+            min
+        };
+        let sentinel = row_value_offset + usize::from(DenseType::MAX);
         // Partition the entries based on whether they can be encoded as dense
         let (dense_or_sparse, sparse_only) = values
             .iter()
             .map(|(suffix, value)| (*suffix, *value))
-            .partition::<BTreeMap<&'a str, usize>, _>(|(suffix, _value)| {
-                // TODO(#7303): Also filter for suffixes that are out of range of the dense row offset
-                self.suffixes.contains(suffix)
+            .partition::<BTreeMap<&'a str, usize>, _>(|(suffix, value)| {
+                // Dense cells need a known suffix and a value inside the row window
+                self.suffixes.contains(suffix) && (row_value_offset..sentinel).contains(value)
             });
         // Check whether the sparse trie is smaller than the dense row
         let sub_trie = dense_or_sparse
@@ -64,7 +100,6 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
             .collect::<ZeroTrieSimpleAscii<Vec<u8>>>();
         if sub_trie.byte_len() > self.suffixes.len() * core::mem::size_of::<DenseType>() {
             // Create a dense prefix
-            let row_value_offset = min;
             let offsets = self
                 .suffixes
                 .iter()
diff --git a/utils/zerotrie/tests/dense_test.rs b/utils/zerotrie/tests/dense_test.rs
index 7815749f667..a8513f103c4 100644
--- a/utils/zerotrie/tests/dense_test.rs
+++ b/utils/zerotrie/tests/dense_test.rs
@@ -137,3 +137,45 @@ fn test_short_subtags() {
     let simple_trie = make_simple_ascii_trie(&data);
     assert_eq!(simple_trie.byte_len(), 1099634);
 }
+
+#[test]
+fn test_dense_sparse_window_selection() {
+    // Hand-picked values (rather than enumerated ones) that span more than one dense
+    // row, so the builder has to choose a row window: the densest one starts at
+    // `cluster_start + 2` and covers 9 of the 13 values.
+    let row_width = usize::from(u16::MAX); // DenseType max
+    let far_low = 0;
+    let cluster_start = 50;
+    let cluster_mid = cluster_start + 2;
+    let cluster_end = cluster_start + 3;
+    let far_high = cluster_start + row_width + 100;
+
+    let mut inner = BTreeMap::new();
+    inner.insert("low", far_low);
+    inner.insert("low2", far_low);
+    inner.insert("a", cluster_start);
+    inner.insert("b", cluster_mid);
+    inner.insert("c", cluster_end);
+    inner.insert("c2", cluster_end);
+    inner.insert("c3", cluster_end);
+    inner.insert("d", cluster_start + row_width - 3);
+    inner.insert("e", cluster_start + row_width - 2);
+    inner.insert("f", cluster_start + row_width - 1);
+    inner.insert("g", cluster_start + row_width);
+    inner.insert("h", cluster_start + row_width + 1);
+    inner.insert("high", far_high);
+
+    let mut data = BTreeMap::new();
+    data.insert("p", inner);
+
+    // Build the 2d trie.
+    let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();
+    let trie = dense.as_borrowed();
+
+    check_data(&data, trie, true);
+
+    let byte_len = check_encoding(dense.as_borrowed());
+    assert_eq!(byte_len, 102);
+    let simple_trie = make_simple_ascii_trie(&data);
+    assert_eq!(simple_trie.byte_len(), 60);
+}