Skip to content

Commit b57d797

Browse files
authored
Add ZeroAsciiDenseSparse2dTrie for more efficient storage of data keys with many attributes (#7264)
#6920
1 parent b2ec500 commit b57d797

File tree

11 files changed

+1650
-2
lines changed

11 files changed

+1650
-2
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utils/host_info/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utils/zerotrie/Cargo.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ bincode = { workspace = true }
3333
icu_benchmark_macros = { path = "../../tools/benchmark/macros" }
3434
litemap = { path = "../../utils/litemap" }
3535
postcard = { workspace = true, features = ["alloc"] }
36+
itertools = { workspace = true }
3637
rand = { workspace = true }
3738
rand_pcg = { workspace = true }
3839
rmp-serde = { workspace = true }
@@ -50,12 +51,14 @@ bench = false # This option is required for Benchmark CI
5051

5152
[features]
5253
default = []
53-
alloc = []
54+
alloc = ["zerovec?/alloc"]
5455
databake = ["dep:databake", "zerovec?/databake"]
56+
dense = ["dep:zerovec"]
5557
litemap = ["dep:litemap", "alloc"]
5658
serde = ["dep:serde_core", "dep:litemap", "alloc", "litemap/serde", "zerovec?/serde"]
5759
yoke = ["dep:yoke"]
5860
zerofrom = ["dep:zerofrom"]
61+
zerovec = ["dep:zerovec"]
5962

6063
[[bench]]
6164
name = "overview"
@@ -70,6 +73,10 @@ required-features = ["alloc", "litemap"]
7073
name = "builder_test"
7174
required-features = ["alloc", "litemap"]
7275

76+
[[test]]
77+
name = "dense_test"
78+
required-features = ["alloc", "dense"]
79+
7380
[[test]]
7481
name = "locale_aux_test"
7582
required-features = ["alloc", "litemap"]
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
// This file is part of ICU4X. For terms of use, please see the file
2+
// called LICENSE at the top level of the ICU4X source tree
3+
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4+
5+
use crate::dense::DenseType;
6+
use crate::dense::ZeroAsciiDenseSparse2dTrieOwned;
7+
use crate::ZeroTrieBuildError;
8+
use crate::ZeroTrieSimpleAscii;
9+
use alloc::collections::BTreeMap;
10+
use alloc::collections::BTreeSet;
11+
use alloc::string::String;
12+
use alloc::vec::Vec;
13+
use zerovec::ZeroVec;
14+
15+
#[derive(PartialEq, Eq, PartialOrd, Ord)]
16+
struct Row<'a> {
17+
prefix: &'a str,
18+
row_value_offset: usize,
19+
offsets: Vec<DenseType>,
20+
}
21+
22+
#[derive(Default)]
23+
pub(crate) struct DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
24+
primary: Vec<(&'a str, &'a str, usize)>,
25+
suffixes: BTreeSet<&'a str>,
26+
dense: Vec<Row<'a>>,
27+
delimiter: u8,
28+
}
29+
30+
impl<'a> DenseSparse2dAsciiWithFixedDelimiterBuilder<'a> {
31+
/// Add a prefix and all values associated with the prefix to the builder.
32+
pub(crate) fn add_prefix(
33+
&mut self,
34+
prefix: &'a str,
35+
values: &BTreeMap<&'a str, usize>,
36+
) -> Result<(), ZeroTrieBuildError> {
37+
// TODO: Is there a more Rusty way to compute min and max together?
38+
let mut min = usize::MAX;
39+
let mut max = 0;
40+
for value in values.values() {
41+
min = core::cmp::min(min, *value);
42+
max = core::cmp::max(max, *value);
43+
}
44+
// >= because DenseType::MAX is the sentinel
45+
if max - min >= usize::from(DenseType::MAX) {
46+
// How to implement this when we need it:
47+
// 1. Select the row offset that gets the greatest number of values into the dense matrix.
48+
// 2. Put all out-of-range values into the primary trie, as we do with sparse rows.
49+
todo!("values in row are not in a sufficiently compact range");
50+
}
51+
let sentinel = min + usize::from(DenseType::MAX);
52+
// Partition the entries based on whether they can be encoded as dense
53+
let (dense_or_sparse, sparse_only) = values
54+
.iter()
55+
.map(|(suffix, value)| (*suffix, *value))
56+
.partition::<BTreeMap<&'a str, usize>, _>(|(suffix, _value)| {
57+
// TODO: Also filter for suffixes that are out of range of the dense row offset
58+
self.suffixes.contains(suffix)
59+
});
60+
// Check whether the sparse trie is smaller than the dense row
61+
let sub_trie = dense_or_sparse
62+
.iter()
63+
.map(|(suffix, value)| (*suffix, *value))
64+
.collect::<ZeroTrieSimpleAscii<Vec<u8>>>();
65+
if sub_trie.byte_len() > self.suffixes.len() * core::mem::size_of::<DenseType>() {
66+
// Create a dense prefix
67+
let row_value_offset = min;
68+
let offsets = self
69+
.suffixes
70+
.iter()
71+
.map(|suffix| {
72+
let value = sub_trie.get(suffix).unwrap_or(sentinel);
73+
let Ok(offset) = DenseType::try_from(value - row_value_offset) else {
74+
unreachable!("this should have been handled earlier");
75+
};
76+
offset
77+
})
78+
.collect::<Vec<DenseType>>();
79+
self.dense.push(Row {
80+
prefix,
81+
row_value_offset,
82+
offsets,
83+
});
84+
for (suffix, value) in sparse_only.iter() {
85+
self.primary.push((prefix, *suffix, *value));
86+
}
87+
Ok(())
88+
} else {
89+
// Create a sparse prefix
90+
for (suffix, value) in values.iter() {
91+
self.primary.push((prefix, *suffix, *value));
92+
}
93+
Ok(())
94+
}
95+
}
96+
97+
/// Assemble the intermediate structures into the final layout.
98+
pub(crate) fn build(mut self) -> Result<ZeroAsciiDenseSparse2dTrieOwned, ZeroTrieBuildError> {
99+
self.dense.sort();
100+
let Ok(suffix_count) = DenseType::try_from(self.suffixes.len()) else {
101+
return Err(ZeroTrieBuildError::CapacityExceeded);
102+
};
103+
let delimiter = self.delimiter as char;
104+
let mut primary_contents = BTreeMap::new();
105+
for (prefix, suffix, value) in self.primary.iter() {
106+
if prefix.contains(delimiter) || suffix.contains(delimiter) {
107+
debug_assert!(false, "handled earlier");
108+
return Err(ZeroTrieBuildError::IllegalDelimiter);
109+
}
110+
let mut delimited = String::with_capacity(prefix.len() + suffix.len() + 1);
111+
delimited.push_str(prefix);
112+
delimited.push(delimiter);
113+
delimited.push_str(suffix);
114+
primary_contents.insert(delimited, *value);
115+
}
116+
let mut dense = Vec::<DenseType>::with_capacity(self.dense.len() * self.suffixes.len());
117+
for (
118+
row_index,
119+
Row {
120+
prefix,
121+
row_value_offset,
122+
offsets,
123+
},
124+
) in self.dense.iter().enumerate()
125+
{
126+
primary_contents.insert(String::from(*prefix), row_index);
127+
let mut prefix_with_delim = String::with_capacity(prefix.len() + 1);
128+
prefix_with_delim.push_str(prefix);
129+
prefix_with_delim.push(delimiter);
130+
primary_contents.insert(prefix_with_delim, *row_value_offset);
131+
dense.extend(offsets);
132+
}
133+
let suffixes = self
134+
.suffixes
135+
.iter()
136+
.enumerate()
137+
.map(|(column_index, suffix)| (*suffix, column_index))
138+
.collect::<BTreeMap<&str, usize>>();
139+
Ok(ZeroAsciiDenseSparse2dTrieOwned {
140+
primary: ZeroTrieSimpleAscii::try_from_btree_map_str(&primary_contents)?,
141+
suffixes: ZeroTrieSimpleAscii::try_from_btree_map_str(&suffixes)?,
142+
dense: ZeroVec::from_slice_or_alloc(dense.as_slice()).into_owned(),
143+
suffix_count,
144+
delimiter: self.delimiter,
145+
})
146+
}
147+
}
148+
149+
impl ZeroAsciiDenseSparse2dTrieOwned {
150+
/// Builds one of these from a two-dimensional BTreeMap and a delimiter.
151+
///
152+
/// Keep in mind the recommendations for optimal data size described in
153+
/// the [class docs].
154+
///
155+
/// [class docs]: ZeroAsciiDenseSparse2dTrieOwned
156+
pub fn try_from_btree_map_str(
157+
entries: &BTreeMap<&str, BTreeMap<&str, usize>>,
158+
delimiter: u8,
159+
) -> Result<Self, ZeroTrieBuildError> {
160+
let mut builder = DenseSparse2dAsciiWithFixedDelimiterBuilder {
161+
delimiter,
162+
..Default::default()
163+
};
164+
// TODO: Prune low-frequency suffixes.
165+
// For now, build with all suffixes.
166+
builder.suffixes = entries
167+
.values()
168+
.flat_map(|inner| inner.keys())
169+
.copied()
170+
.map(|s| {
171+
if s.contains(delimiter as char) {
172+
Err(ZeroTrieBuildError::IllegalDelimiter)
173+
} else {
174+
Ok(s)
175+
}
176+
})
177+
.collect::<Result<_, ZeroTrieBuildError>>()?;
178+
for (prefix, values) in entries.iter() {
179+
if prefix.contains(delimiter as char) {
180+
return Err(ZeroTrieBuildError::IllegalDelimiter);
181+
}
182+
builder.add_prefix(prefix, values)?;
183+
}
184+
builder.build()
185+
}
186+
}

utils/zerotrie/src/builder/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@
151151
152152
mod branch_meta;
153153
pub(crate) mod bytestr;
154+
#[cfg(all(feature = "alloc", feature = "dense"))]
155+
pub(crate) mod dense;
154156
pub(crate) mod konst;
155157
#[cfg(feature = "litemap")]
156158
mod litemap;

0 commit comments

Comments
 (0)