|
5 | 5 | //! |
6 | 6 | //! <https://github.com/trinodb/tpch/blob/master/src/main/java/io/trino/tpch/TextPool.java> |
7 | 7 |
|
8 | | -use crate::{ |
9 | | - distribution::{Distribution, Distributions}, |
10 | | - random::RowRandomInt, |
11 | | -}; |
| 8 | +use crate::{distribution::Distributions, random::RowRandomInt}; |
12 | 9 | use std::sync::OnceLock; |
13 | 10 |
|
14 | 11 | /// Pool of random text that follows TPC-H grammar. |
@@ -157,201 +154,3 @@ impl TextPool { |
157 | 154 | } |
158 | 155 | } |
159 | 156 | } |
160 | | - |
161 | | -#[derive(Debug)] |
162 | | -pub struct TextPoolGenerator { |
163 | | - size: usize, |
164 | | - |
165 | | - grammars: ParsedDistribution, |
166 | | - noun_phrases: ParsedDistribution, |
167 | | - verb_phrases: ParsedDistribution, |
168 | | - prepositions: IndexedDistribution, |
169 | | - terminators: IndexedDistribution, |
170 | | - adverbs: IndexedDistribution, |
171 | | - verbs: IndexedDistribution, |
172 | | - auxiliaries: IndexedDistribution, |
173 | | - articles: IndexedDistribution, |
174 | | - adjectives: IndexedDistribution, |
175 | | - nouns: IndexedDistribution, |
176 | | -} |
177 | | - |
178 | | -impl TextPoolGenerator { |
179 | | - const MAX_SENTENCE_LENGTH: usize = 256; |
180 | | - |
181 | | - pub fn new(size: usize, distributions: &Distributions) -> Self { |
182 | | - TextPoolGenerator { |
183 | | - size, |
184 | | - grammars: ParsedDistribution::new(distributions.grammar()), |
185 | | - noun_phrases: ParsedDistribution::new(distributions.noun_phrase()), |
186 | | - verb_phrases: ParsedDistribution::new(distributions.verb_phrase()), |
187 | | - prepositions: IndexedDistribution::new(distributions.prepositions()), |
188 | | - terminators: IndexedDistribution::new(distributions.terminators()), |
189 | | - adverbs: IndexedDistribution::new(distributions.adverbs()), |
190 | | - verbs: IndexedDistribution::new(distributions.verbs()), |
191 | | - auxiliaries: IndexedDistribution::new(distributions.auxiliaries()), |
192 | | - articles: IndexedDistribution::new(distributions.articles()), |
193 | | - adjectives: IndexedDistribution::new(distributions.adjectives()), |
194 | | - nouns: IndexedDistribution::new(distributions.nouns()), |
195 | | - } |
196 | | - } |
197 | | - |
198 | | - pub fn generate(&mut self) -> String { |
199 | | - let mut output = String::with_capacity(self.size + Self::MAX_SENTENCE_LENGTH); |
200 | | - let mut random_int = RowRandomInt::new(933588178, i32::MAX); |
201 | | - |
202 | | - while output.len() < self.size { |
203 | | - self.generate_sentence(&mut output, &mut random_int); |
204 | | - } |
205 | | - output.truncate(self.size); |
206 | | - output |
207 | | - } |
208 | | - |
209 | | - fn generate_sentence(&self, builder: &mut String, random: &mut RowRandomInt) { |
210 | | - let index = self.grammars.get_random_index(random); |
211 | | - for token in self.grammars.get_tokens(index) { |
212 | | - match token { |
213 | | - 'V' => self.generate_verb_phrase(builder, random), |
214 | | - 'N' => self.generate_noun_phrase(builder, random), |
215 | | - 'P' => { |
216 | | - let preposition = self.prepositions.random_value(random); |
217 | | - builder.push_str(preposition); |
218 | | - builder.push_str(" the "); |
219 | | - self.generate_noun_phrase(builder, random); |
220 | | - } |
221 | | - 'T' => { |
222 | | - // trim trailing space |
223 | | - // terminators should abut previous word |
224 | | - builder.pop(); |
225 | | - let terminator = self.terminators.random_value(random); |
226 | | - builder.push_str(terminator); |
227 | | - } |
228 | | - _ => panic!("Unknown token '{}'", token), |
229 | | - } |
230 | | - |
231 | | - if !builder.ends_with(' ') { |
232 | | - builder.push(' '); |
233 | | - } |
234 | | - } |
235 | | - } |
236 | | - |
237 | | - fn generate_verb_phrase(&self, builder: &mut String, random: &mut RowRandomInt) { |
238 | | - let index = self.verb_phrases.get_random_index(random); |
239 | | - for token in self.verb_phrases.get_tokens(index) { |
240 | | - match token { |
241 | | - 'D' => builder.push_str(self.adverbs.random_value(random)), |
242 | | - 'V' => builder.push_str(self.verbs.random_value(random)), |
243 | | - 'X' => builder.push_str(self.auxiliaries.random_value(random)), |
244 | | - _ => panic!("Unknown token '{}'", token), |
245 | | - } |
246 | | - |
247 | | - // string may end with a comma or such |
248 | | - builder.push_str(self.verb_phrases.get_bonus_text(index)); |
249 | | - |
250 | | - // add a space |
251 | | - builder.push(' '); |
252 | | - } |
253 | | - } |
254 | | - |
255 | | - fn generate_noun_phrase(&self, builder: &mut String, random: &mut RowRandomInt) { |
256 | | - let index = self.noun_phrases.get_random_index(random); |
257 | | - for token in self.noun_phrases.get_tokens(index) { |
258 | | - match token { |
259 | | - 'A' => builder.push_str(self.articles.random_value(random)), |
260 | | - 'J' => builder.push_str(self.adjectives.random_value(random)), |
261 | | - 'D' => builder.push_str(self.adverbs.random_value(random)), |
262 | | - 'N' => builder.push_str(self.nouns.random_value(random)), |
263 | | - _ => panic!("Unknown token '{}'", token), |
264 | | - } |
265 | | - |
266 | | - // string may end with a comma or such |
267 | | - builder.push_str(self.noun_phrases.get_bonus_text(index)); |
268 | | - |
269 | | - // add a space |
270 | | - builder.push(' '); |
271 | | - } |
272 | | - } |
273 | | -} |
274 | | - |
275 | | -#[derive(Debug)] |
276 | | -struct IndexedDistribution { |
277 | | - random_table: Vec<String>, |
278 | | -} |
279 | | - |
280 | | -impl IndexedDistribution { |
281 | | - fn new(distribution: &Distribution) -> Self { |
282 | | - let max_weight = distribution.get_weight(distribution.size() - 1); |
283 | | - let mut random_table = vec![String::new(); max_weight as usize]; |
284 | | - |
285 | | - let mut value_index = 0; |
286 | | - for (i, item) in random_table.iter_mut().enumerate() { |
287 | | - if i >= distribution.get_weight(value_index) as usize { |
288 | | - value_index += 1; |
289 | | - } |
290 | | - *item = distribution.get_value(value_index).to_string(); |
291 | | - } |
292 | | - |
293 | | - IndexedDistribution { random_table } |
294 | | - } |
295 | | - |
296 | | - fn random_value(&self, random: &mut RowRandomInt) -> &str { |
297 | | - let random_index = random.next_int(0, self.random_table.len() as i32 - 1) as usize; |
298 | | - &self.random_table[random_index] |
299 | | - } |
300 | | -} |
301 | | - |
302 | | -#[derive(Debug)] |
303 | | -struct ParsedDistribution { |
304 | | - parsed_distribution: Vec<Vec<char>>, |
305 | | - bonus_text: Vec<String>, |
306 | | - random_table: Vec<usize>, |
307 | | -} |
308 | | - |
309 | | -impl ParsedDistribution { |
310 | | - fn new(distribution: &Distribution) -> Self { |
311 | | - let size = distribution.size(); |
312 | | - let mut parsed_distribution = Vec::with_capacity(size); |
313 | | - let mut bonus_text = Vec::with_capacity(size); |
314 | | - |
315 | | - for i in 0..size { |
316 | | - let value = distribution.get_value(i); |
317 | | - let tokens: Vec<&str> = value.split_whitespace().collect(); |
318 | | - |
319 | | - let mut chars = Vec::with_capacity(tokens.len()); |
320 | | - for token in &tokens { |
321 | | - chars.push(token.chars().next().unwrap()); |
322 | | - bonus_text.push(token[1..].to_string()); |
323 | | - } |
324 | | - parsed_distribution.push(chars); |
325 | | - } |
326 | | - |
327 | | - let max_weight = distribution.get_weight(size - 1); |
328 | | - let mut random_table = vec![0; max_weight as usize]; |
329 | | - |
330 | | - let mut value_index = 0; |
331 | | - for (i, item) in random_table.iter_mut().enumerate() { |
332 | | - if i >= distribution.get_weight(value_index) as usize { |
333 | | - value_index += 1; |
334 | | - } |
335 | | - *item = value_index; |
336 | | - } |
337 | | - |
338 | | - ParsedDistribution { |
339 | | - parsed_distribution, |
340 | | - bonus_text, |
341 | | - random_table, |
342 | | - } |
343 | | - } |
344 | | - |
345 | | - fn get_random_index(&self, random: &mut RowRandomInt) -> usize { |
346 | | - let random_index = random.next_int(0, self.random_table.len() as i32 - 1) as usize; |
347 | | - self.random_table[random_index] |
348 | | - } |
349 | | - |
350 | | - fn get_tokens(&self, index: usize) -> &[char] { |
351 | | - &self.parsed_distribution[index] |
352 | | - } |
353 | | - |
354 | | - fn get_bonus_text(&self, index: usize) -> &str { |
355 | | - &self.bonus_text[index] |
356 | | - } |
357 | | -} |
0 commit comments