Skip to content
36 changes: 27 additions & 9 deletions utils/zerotrie/src/builder/dense.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,28 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
}

impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
/// Finds the best row offset when a row's value range is too large for the dense matrix.
///
/// The values are sorted and scanned with a sliding window. A window is valid while
/// `highest - lowest < DenseType::MAX`, which matches the half-open range
/// `[offset, offset + DenseType::MAX)` that a dense row can encode (`DenseType::MAX`
/// itself is reserved as the sentinel). The smallest value of the window holding the
/// most entries is returned; on a tie, the window with the lowest offset wins.
///
/// Returns `0` if `values` is empty.
fn find_window(values: &BTreeMap<&'a str, usize>) -> usize {
    let row_width = usize::from(DenseType::MAX);
    let mut sorted_vals: Vec<usize> = values.values().copied().collect();
    sorted_vals.sort_unstable();

    // Index of the lowest value still inside the current window.
    let mut bot = 0;
    let mut best_len = 0;
    let mut best_offset = 0;
    for (top, &high) in sorted_vals.iter().enumerate() {
        // Drop values from the bottom of the window until it fits in one dense row.
        // `bot` never passes `top`: once they meet, the difference is zero.
        let low = loop {
            match sorted_vals.get(bot) {
                Some(&low) if high - low >= row_width => bot += 1,
                Some(&low) => break low,
                // Not reachable because `bot <= top < len`; kept so the lookup is panic-free.
                None => break high,
            }
        };
        let len = top + 1 - bot;
        if len > best_len {
            best_len = len;
            best_offset = low;
        }
    }
    best_offset
}

/// Add a prefix and all values associated with the prefix to the builder.
pub(crate) fn add_prefix(
&mut self,
Expand All @@ -42,20 +64,17 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
max = core::cmp::max(max, *value);
}
// >= because DenseType::MAX is the sentinel
let mut row_value_offset = min;
if max - min >= usize::from(DenseType::MAX) {
// TODO(#7303): How to implement this when we need it:
// 1. Select the row offset that gets the greatest number of values into the dense matrix.
// 2. Put all out-of-range values into the primary trie, as we do with sparse rows.
todo!("values in row are not in a sufficiently compact range");
row_value_offset = Self::find_window(values);
}
let sentinel = min + usize::from(DenseType::MAX);
let sentinel = row_value_offset + usize::from(DenseType::MAX);
// Partition the entries based on whether they can be encoded as dense
let (dense_or_sparse, sparse_only) = values
.iter()
.map(|(suffix, value)| (*suffix, *value))
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, _value)| {
// TODO(#7303): Also filter for suffixes that are out of range of the dense row offset
self.suffixes.contains(suffix)
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, value)| {
self.suffixes.contains(suffix) && *value >= row_value_offset && *value < row_value_offset + usize::from(DenseType::MAX)
});
// Check whether the sparse trie is smaller than the dense row
let sub_trie = dense_or_sparse
Expand All @@ -64,7 +83,6 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
.collect::<ZeroTrieSimpleAscii<Vec<u8>>>();
if sub_trie.byte_len() > self.suffixes.len() * core::mem::size_of::<DenseType>() {
// Create a dense prefix
let row_value_offset = min;
let offsets = self
.suffixes
.iter()
Expand Down
36 changes: 36 additions & 0 deletions utils/zerotrie/tests/dense_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,39 @@ fn test_short_subtags() {
let simple_trie = make_simple_ascii_trie(&data);
assert_eq!(simple_trie.byte_len(), 1099634);
}

#[test]
fn test_dense_sparse_window_selection() {
    // Build a single prefix row whose values span more than one dense row width:
    // a tight cluster of values plus one far-away high outlier.
    let row_width = usize::from(u16::MAX); // assumed to equal DenseType::MAX
    let far_low = 0;
    let cluster_start = 50;
    let [val_a, val_b, val_c] = [cluster_start, cluster_start + 2, cluster_start + 3];
    let far_high = cluster_start + row_width + 100;

    let mut inner = BTreeMap::new();
    inner.insert("low", far_low);
    inner.insert("a", val_a);
    inner.insert("b", val_b);
    inner.insert("c", val_c);
    inner.insert("high", far_high);

    let mut data = BTreeMap::new();
    data.insert("p", inner);

    // Build the 2d trie.
    let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();
    let trie = dense.as_borrowed();

    // Every value must round-trip, including the ones outside the dense window.
    assert_eq!(trie.get("p", "a"), Some(val_a));
    assert_eq!(trie.get("p", "b"), Some(val_b));
    assert_eq!(trie.get("p", "c"), Some(val_c));
    assert_eq!(trie.get("p", "low"), Some(far_low));
    assert_eq!(trie.get("p", "high"), Some(far_high));

    let dense_size = check_encoding(trie);
    let simple_size = make_simple_ascii_trie(&data).byte_len();

    // TODO(review): Please add assertions on the two sizes. We assert the sizes because
    // they should be stable, and because you can see what they are from reading the code.
    // They should only change if we change the data structure or builder algorithm.
    println!("Dense size: {}, Simple size: {}", dense_size, simple_size);
}
Loading