From 7e6e75301e887e69f005b7b5855f44a1b3b5250a Mon Sep 17 00:00:00 2001 From: cdog1526 Date: Thu, 11 Dec 2025 02:30:57 -0500 Subject: [PATCH 1/5] Handling #7303: doing window selection to choose subset of row values to go in dense matrix --- utils/zerotrie/src/builder/dense.rs | 35 ++++++++++++++++++++-------- utils/zerotrie/tests/dense_test.rs | 36 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/utils/zerotrie/src/builder/dense.rs b/utils/zerotrie/src/builder/dense.rs index fb5128b957c..3d2fa47f466 100644 --- a/utils/zerotrie/src/builder/dense.rs +++ b/utils/zerotrie/src/builder/dense.rs @@ -29,6 +29,27 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { /// Add a prefix and all values associated with the prefix to the builder. + fn find_window( + values: &BTreeMap<&'a str, usize> + ) -> usize { + let mut sorted_vals: Vec = values.values().cloned().collect(); + let row_width = usize::from(DenseType::MAX); + sorted_vals.sort_unstable(); + let mut bot = 0; + let mut best = 0; + let mut best_index = 0; + for top in 0..sorted_vals.len() { + while bot <= top && sorted_vals[top] - sorted_vals[bot] >= row_width { + bot += 1; + } + if (top - bot + 1) > best { + best = top - bot + 1; + best_index = bot; + } + } + sorted_vals[best_index] + } + pub(crate) fn add_prefix( &mut self, prefix: &'a str, @@ -42,20 +63,17 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { max = core::cmp::max(max, *value); } // >= because DenseType::MAX is the sentinel + let mut row_value_offset = min; if max - min >= usize::from(DenseType::MAX) { - // TODO(#7303): How to implement this when we need it: - // 1. Select the row offset that gets the greatest number of values into the dense matrix. - // 2. Put all out-of-range values into the primary trie, as we do with sparse rows. - todo!("values in row are not in a sufficiently compact range"); + row_value_offset = Self::find_window(values); } - let sentinel = min + usize::from(DenseType::MAX); + let sentinel = row_value_offset + usize::from(DenseType::MAX); // Partition the entries based on whether they can be encoded as dense let (dense_or_sparse, sparse_only) = values .iter() .map(|(suffix, value)| (*suffix, *value)) - .partition::, _>(|(suffix, _value)| { - // TODO(#7303): Also filter for suffixes that are out of range of the dense row offset - self.suffixes.contains(suffix) + .partition::, _>(|(suffix, value)| { + self.suffixes.contains(suffix) && *value >= row_value_offset && *value < row_value_offset + usize::from(DenseType::MAX) }); // Check whether the sparse trie is smaller than the dense row let sub_trie = dense_or_sparse @@ -64,7 +82,6 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { .collect::>>(); if sub_trie.byte_len() > self.suffixes.len() * core::mem::size_of::() { // Create a dense prefix - let row_value_offset = min; let offsets = self .suffixes .iter() diff --git a/utils/zerotrie/tests/dense_test.rs b/utils/zerotrie/tests/dense_test.rs index 7815749f667..274af6cd0c3 100644 --- a/utils/zerotrie/tests/dense_test.rs +++ b/utils/zerotrie/tests/dense_test.rs @@ -137,3 +137,39 @@ fn test_short_subtags() { let simple_trie = make_simple_ascii_trie(&data); assert_eq!(simple_trie.byte_len(), 1099634); } + +#[test] +fn test_dense_sparse_window_selection() { + //Make initial Btree (not using enumerate) + let row_width = usize::from(u16::MAX); // Densetype max + let far_low = 0; + let cluster_start = 50; + let cluster_vals = [cluster_start, cluster_start + 2, cluster_start + 3]; + let far_high = cluster_start + row_width + 100; + + let mut inner = BTreeMap::new(); + inner.insert("low", far_low); + inner.insert("a", cluster_vals[0]); + inner.insert("b", cluster_vals[1]); + inner.insert("c", cluster_vals[2]); + inner.insert("high", far_high); + + let mut data = BTreeMap::new(); + data.insert("p", inner); + + // Build the 2d trie. + let dense = + ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap(); + let trie = dense.as_borrowed(); + + assert_eq!(trie.get("p", "a"), Some(cluster_vals[0])); + assert_eq!(trie.get("p", "b"), Some(cluster_vals[1])); + assert_eq!(trie.get("p", "c"), Some(cluster_vals[2])); + assert_eq!(trie.get("p", "low"), Some(far_low)); + assert_eq!(trie.get("p", "high"), Some(far_high)); + + let dense_size = check_encoding(trie); + let simple_size = make_simple_ascii_trie(&data).byte_len(); + + println!("Dense size: {}, Simple size: {}", dense_size, simple_size); +} From a89a176692cb0606f1b754b775b18e13c9941287 Mon Sep 17 00:00:00 2001 From: cdog1526 Date: Thu, 11 Dec 2025 02:36:44 -0500 Subject: [PATCH 2/5] Handling #7303: doing window selection to choose subset of row values to go in dense matrix --- utils/zerotrie/src/builder/dense.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/zerotrie/src/builder/dense.rs b/utils/zerotrie/src/builder/dense.rs index 3d2fa47f466..64bbf1673c1 100644 --- a/utils/zerotrie/src/builder/dense.rs +++ b/utils/zerotrie/src/builder/dense.rs @@ -28,7 +28,7 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { } impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { - /// Add a prefix and all values associated with the prefix to the builder. + ///Helper function: finds best row offset when value range too large for dense matrix fn find_window( values: &BTreeMap<&'a str, usize> ) -> usize { @@ -50,6 +50,7 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { sorted_vals[best_index] } + /// Add a prefix and all values associated with the prefix to the builder. pub(crate) fn add_prefix( &mut self, prefix: &'a str, From 80bdebea562eed21963a50c0b4ba458fe5d4e19a Mon Sep 17 00:00:00 2001 From: cdog1526 Date: Thu, 11 Dec 2025 12:40:36 -0500 Subject: [PATCH 3/5] Fixing array access --- utils/zerotrie/src/builder/dense.rs | 4 ++-- utils/zerotrie/tests/dense_test.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/utils/zerotrie/src/builder/dense.rs b/utils/zerotrie/src/builder/dense.rs index 64bbf1673c1..c205a15fdc8 100644 --- a/utils/zerotrie/src/builder/dense.rs +++ b/utils/zerotrie/src/builder/dense.rs @@ -39,7 +39,7 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { let mut best = 0; let mut best_index = 0; for top in 0..sorted_vals.len() { - while bot <= top && sorted_vals[top] - sorted_vals[bot] >= row_width { + while bot <= top && sorted_vals.get(top) - sorted_vals.get(bot) >= row_width { bot += 1; } if (top - bot + 1) > best { @@ -47,7 +47,7 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { best_index = bot; } } - sorted_vals[best_index] + sorted_vals.get(best_index) } /// Add a prefix and all values associated with the prefix to the builder. diff --git a/utils/zerotrie/tests/dense_test.rs b/utils/zerotrie/tests/dense_test.rs index 274af6cd0c3..6257a02301f 100644 --- a/utils/zerotrie/tests/dense_test.rs +++ b/utils/zerotrie/tests/dense_test.rs @@ -149,9 +149,9 @@ fn test_dense_sparse_window_selection() { let mut inner = BTreeMap::new(); inner.insert("low", far_low); - inner.insert("a", cluster_vals[0]); - inner.insert("b", cluster_vals[1]); - inner.insert("c", cluster_vals[2]); + inner.insert("a", cluster_vals.get(0)); + inner.insert("b", cluster_vals.get(1)); + inner.insert("c", cluster_vals.get(2)); inner.insert("high", far_high); let mut data = BTreeMap::new(); @@ -162,9 +162,9 @@ fn test_dense_sparse_window_selection() { ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap(); let trie = dense.as_borrowed(); - assert_eq!(trie.get("p", "a"), Some(cluster_vals[0])); - assert_eq!(trie.get("p", "b"), Some(cluster_vals[1])); - assert_eq!(trie.get("p", "c"), Some(cluster_vals[2])); + assert_eq!(trie.get("p", "a"), Some(cluster_vals.get(0))); + assert_eq!(trie.get("p", "b"), Some(cluster_vals.get(1))); + assert_eq!(trie.get("p", "c"), Some(cluster_vals.get(2))); assert_eq!(trie.get("p", "low"), Some(far_low)); assert_eq!(trie.get("p", "high"), Some(far_high)); From daef8b522e3be3a8d24bbc393843437ab64c7cf6 Mon Sep 17 00:00:00 2001 From: cdog1526 Date: Fri, 12 Dec 2025 00:06:47 -0500 Subject: [PATCH 4/5] Updating based on feedback - fixing fmt errors and standardizing test --- utils/zerotrie/src/builder/dense.rs | 21 +++++++++++-------- utils/zerotrie/tests/dense_test.rs | 31 ++++++++++++++++------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/utils/zerotrie/src/builder/dense.rs b/utils/zerotrie/src/builder/dense.rs index c205a15fdc8..40c8f994f00 100644 --- a/utils/zerotrie/src/builder/dense.rs +++ b/utils/zerotrie/src/builder/dense.rs @@ -29,9 +29,7 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { ///Helper function: finds best row offset when value range too large for dense matrix - fn find_window( - values: &BTreeMap<&'a str, usize> - ) -> usize { + fn find_window(values: &BTreeMap<&'a str, usize>) -> usize { let mut sorted_vals: Vec = values.values().cloned().collect(); let row_width = usize::from(DenseType::MAX); sorted_vals.sort_unstable(); @@ -39,17 +37,22 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { let mut best = 0; let mut best_index = 0; for top in 0..sorted_vals.len() { - while bot <= top && sorted_vals.get(top) - sorted_vals.get(bot) >= row_width { - bot += 1; + while bot < top { + let top_val = sorted_vals.get(top).copied().unwrap_or(0); + let bot_val = sorted_vals.get(bot).copied().unwrap_or(0); + if top_val - bot_val >= row_width { + bot += 1; + } else { + break; + } } if (top - bot + 1) > best { best = top - bot + 1; best_index = bot; } } - sorted_vals.get(best_index) + sorted_vals.get(best_index).copied().unwrap_or(0) } - /// Add a prefix and all values associated with the prefix to the builder. pub(crate) fn add_prefix( &mut self, @@ -74,7 +77,9 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> { .iter() .map(|(suffix, value)| (*suffix, *value)) .partition::, _>(|(suffix, value)| { - self.suffixes.contains(suffix) && *value >= row_value_offset && *value < row_value_offset + usize::from(DenseType::MAX) + self.suffixes.contains(suffix) + && *value >= row_value_offset + && *value < row_value_offset + usize::from(DenseType::MAX) }); // Check whether the sparse trie is smaller than the dense row let sub_trie = dense_or_sparse diff --git a/utils/zerotrie/tests/dense_test.rs b/utils/zerotrie/tests/dense_test.rs index 6257a02301f..80208c5fbb8 100644 --- a/utils/zerotrie/tests/dense_test.rs +++ b/utils/zerotrie/tests/dense_test.rs @@ -149,27 +149,30 @@ fn test_dense_sparse_window_selection() { let mut inner = BTreeMap::new(); inner.insert("low", far_low); - inner.insert("a", cluster_vals.get(0)); - inner.insert("b", cluster_vals.get(1)); - inner.insert("c", cluster_vals.get(2)); + inner.insert("a", cluster_vals.get(0).copied().unwrap_or(0)); + inner.insert("c", cluster_vals.get(2).copied().unwrap_or(0)); + inner.insert("d", cluster_start + row_width - 3); + inner.insert("e", cluster_start + row_width - 2); + inner.insert("f", cluster_start + row_width - 1); + inner.insert("c2", cluster_vals.get(2).copied().unwrap_or(0)); + inner.insert("g", cluster_start + row_width); + inner.insert("h", cluster_start + row_width + 1); + inner.insert("b", cluster_vals.get(1).copied().unwrap_or(0)); inner.insert("high", far_high); + inner.insert("c3", cluster_vals.get(2).copied().unwrap_or(0)); + inner.insert("low2", far_low); let mut data = BTreeMap::new(); data.insert("p", inner); // Build the 2d trie. - let dense = - ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap(); + let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap(); let trie = dense.as_borrowed(); - assert_eq!(trie.get("p", "a"), Some(cluster_vals.get(0))); - assert_eq!(trie.get("p", "b"), Some(cluster_vals.get(1))); - assert_eq!(trie.get("p", "c"), Some(cluster_vals.get(2))); - assert_eq!(trie.get("p", "low"), Some(far_low)); - assert_eq!(trie.get("p", "high"), Some(far_high)); - - let dense_size = check_encoding(trie); - let simple_size = make_simple_ascii_trie(&data).byte_len(); + check_data(&data, trie, true); - println!("Dense size: {}, Simple size: {}", dense_size, simple_size); + let byte_len = check_encoding(dense.as_borrowed()); + assert_eq!(byte_len, 102); + let simple_trie = make_simple_ascii_trie(&data); + assert_eq!(simple_trie.byte_len(), 60); } From 4d1ec1055b3c51ee82b4d5b6fbe24f007f5d7dcf Mon Sep 17 00:00:00 2001 From: cdog1526 Date: Fri, 12 Dec 2025 18:38:59 -0500 Subject: [PATCH 5/5] Using first() instead of get(0) --- utils/zerotrie/tests/dense_test.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/zerotrie/tests/dense_test.rs b/utils/zerotrie/tests/dense_test.rs index 80208c5fbb8..a8513f103c4 100644 --- a/utils/zerotrie/tests/dense_test.rs +++ b/utils/zerotrie/tests/dense_test.rs @@ -149,7 +149,7 @@ fn test_dense_sparse_window_selection() { let mut inner = BTreeMap::new(); inner.insert("low", far_low); - inner.insert("a", cluster_vals.get(0).copied().unwrap_or(0)); + inner.insert("a", cluster_vals.first().copied().unwrap_or(0)); inner.insert("c", cluster_vals.get(2).copied().unwrap_or(0)); inner.insert("d", cluster_start + row_width - 3); inner.insert("e", cluster_start + row_width - 2);