Skip to content

Commit 3903909

Browse files
committed
Test different chunk-sizes
1 parent 6f9caa3 commit 3903909

File tree

3 files changed

+225
-66
lines changed

3 files changed

+225
-66
lines changed

build/ucd_generator.rs

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
1-
21
mod code_point_description;
3-
mod parsers;
4-
mod download;
52
mod column;
3+
mod download;
64
mod generators;
5+
mod parsers;
76

8-
use thiserror::Error;
97
use code_point_description::*;
8+
use thiserror::Error;
109

1110
#[derive(Error, Debug)]
1211
pub enum Error {
@@ -18,25 +17,53 @@ pub enum Error {
1817
Parser(#[from] parsers::Error),
1918
}
2019

20+
fn generate_enum_table<'a>(
21+
code_dir: &std::path::Path,
22+
name: &str,
23+
mut enum_table: Vec<String>,
24+
op: impl Fn(usize) -> &'a String,
25+
) -> Result<(), Error> {
26+
let column = column::map_str_to_int(&mut enum_table, op);
2127

22-
pub fn ucd_generator(ucd_base_url: &str, ucd_version: &str, data_dir: &std::path::Path, code_dir: &std::path::Path) -> Result<(), Error>
23-
{
28+
let (dedup, dedup_bits, index, index_bits, chunk_size) = column::dedup_best_fit(&column);
29+
30+
generators::generate_enum_table(
31+
&code_dir,
32+
name,
33+
&enum_table,
34+
&dedup,
35+
dedup_bits,
36+
&index,
37+
index_bits,
38+
chunk_size,
39+
)?;
40+
41+
return Ok(());
42+
}
43+
44+
pub fn ucd_generator(
45+
ucd_base_url: &str,
46+
ucd_version: &str,
47+
data_dir: &std::path::Path,
48+
code_dir: &std::path::Path,
49+
) -> Result<(), Error> {
2450
let mut code_point_descriptions = Vec::<CodePointDescription>::with_capacity(0x110000);
2551
code_point_descriptions.resize(0x110000, CodePointDescription::new());
2652

27-
parsers::parse_east_asian_width(&ucd_base_url, &ucd_version, &data_dir, &mut code_point_descriptions)?;
53+
parsers::parse_east_asian_width(
54+
&ucd_base_url,
55+
&ucd_version,
56+
&data_dir,
57+
&mut code_point_descriptions,
58+
)?;
2859

29-
const EAST_ASIAN_WIDTH_CHUNK_SIZE : usize = 256;
30-
let mut east_asian_width_enum = vec!["N".to_string()];
31-
let mut east_asian_width_column = column::map_str_to_int(&mut east_asian_width_enum, |x| &code_point_descriptions[x].east_asian_width);
32-
let east_asian_width_index = column::dedup(&mut east_asian_width_column, EAST_ASIAN_WIDTH_CHUNK_SIZE);
33-
generators::generate_enum_table(&code_dir, "east_asian_width", &east_asian_width_enum, &east_asian_width_column, &east_asian_width_index, EAST_ASIAN_WIDTH_CHUNK_SIZE)?;
60+
generate_enum_table(code_dir, "east_asian_width", vec!["N".to_string()], |x| {
61+
&code_point_descriptions[x].east_asian_width
62+
})?;
3463

3564
//if let Err(e) = parse_line_break_properties(&ucd_base_url, &ucd_version, &data_dir, &mut code_point_descriptions) {
3665
// return Err(e);
3766
//}
3867

39-
4068
return Ok(());
4169
}
42-

build/ucd_generator/column.rs

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,29 @@
1010
/// - `dst_chunk`: The index to the chunk where the data will be copied to.
1111
/// - `src_chunk`: The index to the chunk where the data will be copied from.
1212
/// - `chunk_size`: The size of the chunks.
13-
///
14-
fn copy_chunk(column: &mut Vec<usize>, dst_chunk: usize, src_chunk: usize, chunk_size: usize)
13+
///
14+
fn copy_chunk(src: &Vec<usize>, src_chunk: usize, dst: &mut Vec<usize>, dst_chunk: usize, chunk_size: usize)
1515
{
1616
let dst_offset = dst_chunk * chunk_size;
1717
let src_offset = src_chunk * chunk_size;
1818

19+
let dst_size = dst_offset + chunk_size;
20+
if dst.len() < dst_size {
21+
dst.resize(dst_size, 0);
22+
}
23+
1924
for i in 0..chunk_size {
20-
column[dst_offset + i] = column[src_offset + i];
25+
dst[dst_offset + i] = src[src_offset + i];
2126
}
2227
}
2328

24-
fn test_chunk(column: &Vec<usize>, a: usize, b: usize, chunk_size: usize) -> bool
29+
fn test_chunk(src: &Vec<usize>, src_chunk: usize, dst: &Vec<usize>, dst_chunk: usize, chunk_size: usize) -> bool
2530
{
26-
let a_offset = a * chunk_size;
27-
let b_offset = b * chunk_size;
31+
let src_offset = src_chunk * chunk_size;
32+
let dst_offset = dst_chunk * chunk_size;
2833

2934
for i in 0..chunk_size {
30-
if column[a_offset + i] != column[b_offset + i] {
35+
if dst[dst_offset + i] != src[src_offset + i] {
3136
return false;
3237
}
3338
}
@@ -38,11 +43,12 @@ fn test_chunk(column: &Vec<usize>, a: usize, b: usize, chunk_size: usize) -> boo
3843
/// `src_chunk`.
3944
///
4045
/// # Arguments
41-
/// - `column`: The vector to deduplicate.
46+
/// - `src`: The data source.
47+
/// - `src_chunk`: The chunk-index to the source-chunk.
48+
/// - `dst`: The deduplicated table.
4249
/// - `dst_chunk`: The chunk-index to the destination-chunk where the
4350
/// source-chunk will be copied to; if all chunks from
4451
/// `0..dst_chunk` are unequal to source-chunk.
45-
/// - `src_chunk`: The chunk-index to the source-chunk.
4652
/// - `chunk_size`: The size of the chunks.
4753
///
4854
/// # Returns
@@ -52,11 +58,11 @@ fn test_chunk(column: &Vec<usize>, a: usize, b: usize, chunk_size: usize) -> boo
5258
/// The `dst_chunk` is returned so that you can directly copy the source-chunk
5359
/// to the correct position.
5460
///
55-
fn test_chunks(column: &Vec<usize>, dst_chunk: usize, src_chunk: usize, chunk_size: usize) -> usize
61+
fn test_chunks(src: &Vec<usize>, src_chunk: usize, dst: &Vec<usize>, dst_chunk: usize, chunk_size: usize) -> usize
5662
{
57-
for a_chunk in 0..dst_chunk {
58-
if test_chunk(column, a_chunk, src_chunk, chunk_size) {
59-
return a_chunk;
63+
for i in 0..dst_chunk {
64+
if test_chunk(src, src_chunk, dst, i, chunk_size) {
65+
return i;
6066
}
6167
}
6268
return dst_chunk;
@@ -69,32 +75,30 @@ fn test_chunks(column: &Vec<usize>, dst_chunk: usize, src_chunk: usize, chunk_si
6975
/// - `chunk_size` : The size of the chunks.
7076
///
7177
/// # Returns
72-
/// The index-table.
78+
/// (deduplicated-data-table, index-table).
7379
///
7480
/// To find a specific entry `i` in the deduplicated column:
7581
/// - `chunk_nr = min(i / chunk_size, index_table.len() - 1)`
7682
/// - `offset = index_table[chunk_nr] * chunk_size + i % chunk_size`
7783
/// - `entry = dedup_table[offset]`
7884
///
79-
pub fn dedup(column: &mut Vec<usize>, chunk_size: usize) -> Vec<usize>
85+
pub fn dedup(column: &Vec<usize>, chunk_size: usize) -> (Vec<usize>, Vec<usize>)
8086
{
8187
let num_chunks = 0x110000 / chunk_size;
8288
let mut index_table = Vec::<usize>::new();
89+
let mut dedup_table = Vec::<usize>::new();
8390

8491
// Deduplicating the column table and create an index table.
8592
let mut dst_chunk = 0;
8693
for src_chunk in 0..num_chunks {
87-
let found_chunk = test_chunks(column, dst_chunk, src_chunk, chunk_size);
94+
let found_chunk = test_chunks(column, src_chunk, &dedup_table, dst_chunk, chunk_size);
8895
index_table.push(found_chunk);
8996
if found_chunk == dst_chunk {
90-
copy_chunk(column, dst_chunk, src_chunk, chunk_size);
97+
copy_chunk(column, src_chunk, &mut dedup_table, dst_chunk, chunk_size);
9198
dst_chunk += 1;
9299
}
93100
}
94101

95-
// Truncate the column, after deduplicating the column.
96-
column.truncate(dst_chunk * chunk_size);
97-
98102
// Truncate the index_table, so that the last value is not repeating.
99103
if let Some(&last_value) = index_table.last() {
100104
loop {
@@ -112,7 +116,42 @@ pub fn dedup(column: &mut Vec<usize>, chunk_size: usize) -> Vec<usize>
112116
index_table.push(last_value);
113117
}
114118

115-
return index_table;
119+
return (dedup_table, index_table);
120+
}
121+
122+
pub fn dedup_best_fit(column: &Vec<usize>) -> (Vec<usize>, usize, Vec<usize>, usize, usize)
123+
{
124+
let chunk_sizes = vec![
125+
32 as usize,
126+
64 as usize,
127+
128 as usize,
128+
256 as usize,
129+
512 as usize,
130+
];
131+
132+
let mut best_chunk_size: usize = 0;
133+
let mut best_dedup = Vec::<usize>::new();
134+
let mut best_index = Vec::<usize>::new();
135+
let mut best_dedup_bits: usize = 0;
136+
let mut best_index_bits: usize = 0;
137+
let mut best_byte_len: usize = 0;
138+
for chunk_size in chunk_sizes {
139+
let (dedup, index) = dedup(&column, chunk_size);
140+
let dedup_bits = get_width(&dedup);
141+
let index_bits = get_width(&index);
142+
143+
let byte_len = dedup.len() * dedup_bits + index.len() * index_bits;
144+
if best_byte_len == 0 || byte_len < best_byte_len {
145+
best_chunk_size = chunk_size;
146+
best_dedup = dedup;
147+
best_index = index;
148+
best_dedup_bits = dedup_bits;
149+
best_index_bits = index_bits;
150+
best_byte_len = byte_len;
151+
}
152+
}
153+
154+
return (best_dedup, best_dedup_bits, best_index, best_index_bits, best_chunk_size);
116155
}
117156

118157
pub fn map_str_to_int<'a>(order: &mut Vec<String>, op: impl Fn(usize) -> &'a String) -> Vec<usize>
@@ -166,3 +205,8 @@ pub fn compress(input: &Vec<usize>, num_bits: usize) -> Vec<u8>
166205
return r;
167206
}
168207

208+
/// Returns the number of bits needed to store the largest value in `input`.
///
/// # Arguments
/// - `input`: The values to measure.
///
/// # Returns
/// The smallest `n` such that every value in `input` is less than `2^n`.
/// An empty input, or one containing only zeros, yields `0`.
pub fn get_width(input: &[usize]) -> usize {
    let max = input.iter().copied().max().unwrap_or(0);

    // The bit length of `max`. This equals
    // `(max + 1).next_power_of_two().trailing_zeros()` but cannot overflow
    // when `max == usize::MAX`.
    (usize::BITS - max.leading_zeros()) as usize
}

0 commit comments

Comments
 (0)