Skip to content

Commit 844910b

Browse files
Cdog1526sffc
andauthored
Dense zero trie Out of Range handling #7303 (#7305)
Working on #7303. When creating a dense sparse zerotrie, if the incoming values are too spread out for a dense matrix: - Select the row offset that gets the greatest number of values into the dense matrix. - Put all out-of-range values into the primary trie, as we do with sparse rows. Also, added a test case for this. I am not certain that this test case is sufficient, this is my first contribution here. <!-- Thank you for your pull request to ICU4X! Reminder: try to use [Conventional Comments](https://conventionalcomments.org/) to make comments clearer. Please see https://github.com/unicode-org/icu4x/blob/main/CONTRIBUTING.md for general information on contributing to ICU4X. --> --------- Co-authored-by: Shane F. Carr <shane@unicode.org>
1 parent 9725af9 commit 844910b

File tree

2 files changed

+70
-8
lines changed

2 files changed

+70
-8
lines changed

utils/zerotrie/src/builder/dense.rs

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,32 @@ pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
2828
}
2929

3030
impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
31+
/// Helper function: finds best row offset when value range too large for dense matrix
32+
#[allow(clippy::indexing_slicing)] // bot < top and top < sorted_vals.len()
33+
fn find_window(values: &BTreeMap<&'a str, usize>) -> usize {
34+
let mut sorted_vals: Vec<usize> = values.values().cloned().collect();
35+
let row_width = usize::from(DenseType::MAX);
36+
sorted_vals.sort_unstable();
37+
let mut bot = 0;
38+
let mut best = 0;
39+
let mut best_index = 0;
40+
for top in 0..sorted_vals.len() {
41+
while bot < top {
42+
let top_val = sorted_vals[top];
43+
let bot_val = sorted_vals[bot];
44+
if top_val - bot_val >= row_width {
45+
bot += 1;
46+
} else {
47+
break;
48+
}
49+
}
50+
if (top - bot + 1) > best {
51+
best = top - bot + 1;
52+
best_index = bot;
53+
}
54+
}
55+
sorted_vals[best_index]
56+
}
3157
/// Add a prefix and all values associated with the prefix to the builder.
3258
pub(crate) fn add_prefix(
3359
&mut self,
@@ -42,20 +68,19 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
4268
max = core::cmp::max(max, *value);
4369
}
4470
// >= because DenseType::MAX is the sentinel
71+
let mut row_value_offset = min;
4572
if max - min >= usize::from(DenseType::MAX) {
46-
// TODO(#7303): How to implement this when we need it:
47-
// 1. Select the row offset that gets the greatest number of values into the dense matrix.
48-
// 2. Put all out-of-range values into the primary trie, as we do with sparse rows.
49-
todo!("values in row are not in a sufficiently compact range");
73+
row_value_offset = Self::find_window(values);
5074
}
51-
let sentinel = min + usize::from(DenseType::MAX);
75+
let sentinel = row_value_offset + usize::from(DenseType::MAX);
5276
// Partition the entries based on whether they can be encoded as dense
5377
let (dense_or_sparse, sparse_only) = values
5478
.iter()
5579
.map(|(suffix, value)| (*suffix, *value))
56-
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, _value)| {
57-
// TODO(#7303): Also filter for suffixes that are out of range of the dense row offset
80+
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, value)| {
5881
self.suffixes.contains(suffix)
82+
&& *value >= row_value_offset
83+
&& *value < row_value_offset + usize::from(DenseType::MAX)
5984
});
6085
// Check whether the sparse trie is smaller than the dense row
6186
let sub_trie = dense_or_sparse
@@ -64,7 +89,6 @@ impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
6489
.collect::<ZeroTrieSimpleAscii<Vec<u8>>>();
6590
if sub_trie.byte_len() > self.suffixes.len() * core::mem::size_of::<DenseType>() {
6691
// Create a dense prefix
67-
let row_value_offset = min;
6892
let offsets = self
6993
.suffixes
7094
.iter()

utils/zerotrie/tests/dense_test.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,3 +137,41 @@ fn test_short_subtags() {
137137
let simple_trie = make_simple_ascii_trie(&data);
138138
assert_eq!(simple_trie.byte_len(), 1099634);
139139
}
140+
141+
#[test]
142+
fn test_dense_sparse_window_selection() {
143+
//Make initial Btree (not using enumerate)
144+
let row_width = usize::from(u16::MAX); // Densetype max
145+
let far_low = 0;
146+
let cluster_start = 50;
147+
let far_high = cluster_start + row_width + 100;
148+
149+
let mut inner = BTreeMap::new();
150+
inner.insert("low", far_low);
151+
inner.insert("a", 50); // cluster_start
152+
inner.insert("c", 53); // cluster_start + 3
153+
inner.insert("d", cluster_start + row_width - 3);
154+
inner.insert("e", cluster_start + row_width - 2);
155+
inner.insert("f", cluster_start + row_width - 1);
156+
inner.insert("c2", 53); // cluster_start + 3
157+
inner.insert("g", cluster_start + row_width);
158+
inner.insert("h", cluster_start + row_width + 1);
159+
inner.insert("b", 52); // cluster_start + 2
160+
inner.insert("high", far_high);
161+
inner.insert("c3", 53); // cluster_start + 3
162+
inner.insert("low2", far_low);
163+
164+
let mut data = BTreeMap::new();
165+
data.insert("p", inner);
166+
167+
// Build the 2d trie.
168+
let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();
169+
let trie = dense.as_borrowed();
170+
171+
check_data(&data, trie, true);
172+
173+
let byte_len = check_encoding(dense.as_borrowed());
174+
assert_eq!(byte_len, 102);
175+
let simple_trie = make_simple_ascii_trie(&data);
176+
assert_eq!(simple_trie.byte_len(), 60);
177+
}

0 commit comments

Comments
 (0)