Skip to content
39 changes: 31 additions & 8 deletions utils/zerotrie/src/builder/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,31 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
}

impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
/// Picks the row offset to use when a row's values span too wide a range for one dense row.
///
/// The values are sorted and scanned with a sliding window that only keeps values `v`
/// satisfying `v - offset < DenseType::MAX`, where `offset` is the smallest value in the
/// window. The offset of the window holding the most values is returned; when several
/// windows tie, the one with the lowest offset wins. Returns `0` if `values` is empty.
fn find_window(values: &BTreeMap<&'a str, usize>) -> usize {
    let mut ordered: Vec<usize> = values.values().copied().collect();
    ordered.sort_unstable();
    let span = usize::from(DenseType::MAX);

    let mut start = 0;
    let mut widest = 0;
    let mut best_start = 0;
    for (end, &hi) in ordered.iter().enumerate() {
        // Drop values from the low end of the window until the remaining range fits.
        // `start <= end` always holds, and the slice is sorted, so `hi - lo` cannot underflow.
        start += ordered
            .get(start..end)
            .unwrap_or(&[])
            .iter()
            .take_while(|&&lo| hi - lo >= span)
            .count();

        let count = end - start + 1;
        // Strict comparison: on a tie, the earlier (lower) window is kept.
        if count > widest {
            widest = count;
            best_start = start;
        }
    }
    ordered.get(best_start).copied().unwrap_or(0)
}
/// Add a prefix and all values associated with the prefix to the builder.
pub(crate) fn add_prefix(
&mut self,
Expand All @@ -42,20 +67,19 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
max = core::cmp::max(max, *value);
}
// >= because DenseType::MAX is the sentinel
let mut row_value_offset = min;
if max - min >= usize::from(DenseType::MAX) {
// TODO(#7303): How to implement this when we need it:
// 1. Select the row offset that gets the greatest number of values into the dense matrix.
// 2. Put all out-of-range values into the primary trie, as we do with sparse rows.
todo!("values in row are not in a sufficiently compact range");
row_value_offset = Self::find_window(values);
}
let sentinel = min + usize::from(DenseType::MAX);
let sentinel = row_value_offset + usize::from(DenseType::MAX);
// Partition the entries based on whether they can be encoded as dense
let (dense_or_sparse, sparse_only) = values
.iter()
.map(|(suffix, value)| (*suffix, *value))
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, _value)| {
// TODO(#7303): Also filter for suffixes that are out of range of the dense row offset
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, value)| {
self.suffixes.contains(suffix)
&& *value >= row_value_offset
&& *value < row_value_offset + usize::from(DenseType::MAX)
});
// Check whether the sparse trie is smaller than the dense row
let sub_trie = dense_or_sparse
Expand All @@ -64,7 +88,6 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
.collect::<ZeroTrieSimpleAscii<Vec<u8>>>();
if sub_trie.byte_len() > self.suffixes.len() * core::mem::size_of::<DenseType>() {
// Create a dense prefix
let row_value_offset = min;
let offsets = self
.suffixes
.iter()
Expand Down
39 changes: 39 additions & 0 deletions utils/zerotrie/tests/dense_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,42 @@ fn test_short_subtags() {
let simple_trie = make_simple_ascii_trie(&data);
assert_eq!(simple_trie.byte_len(), 1099634);
}

#[test]
fn test_dense_sparse_window_selection() {
    // Row whose values span more than one dense row can cover, so the builder has to
    // choose a window instead of starting at the minimum value.
    //
    // Sorted values: 0, 0, 50, 52, 53, 53, 53, then five values just below and at
    // `base + row_width`, then one far outlier. The largest group of values lying within
    // a span narrower than `row_width` has 9 members and starts at the value 52.
    let row_width = usize::from(u16::MAX); // width limit used for dense rows
    let base = 50;

    let row: BTreeMap<&str, usize> = BTreeMap::from([
        // Far below the main cluster
        ("low", 0),
        ("low2", 0),
        // Low cluster, including duplicated values
        ("a", base),
        ("b", base + 2),
        ("c", base + 3),
        ("c2", base + 3),
        ("c3", base + 3),
        // Values straddling the far edge of a window anchored at `base`
        ("d", base + row_width - 3),
        ("e", base + row_width - 2),
        ("f", base + row_width - 1),
        ("g", base + row_width),
        ("h", base + row_width + 1),
        // Far above everything else
        ("high", base + row_width + 100),
    ]);
    let data = BTreeMap::from([("p", row)]);

    // Build the 2d trie and verify that every entry round-trips.
    let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();
    check_data(&data, dense.as_borrowed(), true);

    // Pin the encoded sizes of the 2d trie and of the equivalent simple ASCII trie.
    let byte_len = check_encoding(dense.as_borrowed());
    assert_eq!(byte_len, 102);
    let simple_trie = make_simple_ascii_trie(&data);
    assert_eq!(simple_trie.byte_len(), 60);
}
Loading