Skip to content

Commit cd0f6ae

Browse files
committed
more tables
1 parent 71d9e7b commit cd0f6ae

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+3177
-71
lines changed

.rustfmt.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
max_width = 130
2+
fn_call_width = 130

build_src/mod.rs

Lines changed: 76 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -21,142 +21,147 @@ fn generate_enum_table<'a>(
2121
code_dir: &std::path::Path,
2222
name: &str,
2323
mut enum_table: Vec<String>,
24-
op: impl Fn(usize) -> &'a String
25-
) -> Result<(), Error>
26-
{
24+
op: impl Fn(usize) -> &'a String,
25+
) -> Result<(), Error> {
2726
let column = column::map_str_to_int(&mut enum_table, op);
2827

2928
let (dedup, dedup_bits, index, index_bits, chunk_size) = column::dedup_best_fit(&column);
3029

31-
generators::generate_enum_table(
32-
&code_dir,
33-
name,
34-
&enum_table,
35-
&dedup,
36-
dedup_bits,
37-
&index,
38-
index_bits,
39-
chunk_size,
40-
)?;
30+
generators::generate_enum_table(&code_dir, name, &enum_table, &dedup, dedup_bits, &index, index_bits, chunk_size)?;
4131

4232
return Ok(());
4333
}
4434

45-
fn generate_bool_table(
46-
code_dir: &std::path::Path,
47-
name: &str,
48-
op: impl Fn(usize) -> bool
49-
) -> Result<(), Error>
50-
{
35+
fn generate_bool_table(code_dir: &std::path::Path, name: &str, op: impl Fn(usize) -> bool) -> Result<(), Error> {
5136
let column = column::map_bool_to_int(op);
5237

5338
let (dedup, dedup_bits, index, index_bits, chunk_size) = column::dedup_best_fit(&column);
5439
assert!(dedup_bits == 1);
5540

56-
generators::generate_bool_table(
57-
&code_dir,
58-
name,
59-
&dedup,
60-
&index,
61-
index_bits,
62-
chunk_size,
63-
)?;
41+
generators::generate_bool_table(&code_dir, name, &dedup, &index, index_bits, chunk_size)?;
6442

6543
return Ok(());
6644
}
6745

68-
pub fn build(
69-
ucd_base_url: &str,
70-
ucd_version: &str,
71-
data_dir: &std::path::Path,
72-
code_dir: &std::path::Path,
73-
) -> Result<(), Error> {
46+
pub fn build(ucd_base_url: &str, ucd_version: &str, data_dir: &std::path::Path, code_dir: &std::path::Path) -> Result<(), Error> {
7447
let mut code_point_descriptions = Vec::<CodePointDescription>::with_capacity(0x110000);
7548
code_point_descriptions.resize(0x110000, CodePointDescription::new());
7649

7750
parsers::parse_single_column(
7851
&format!("{}/{}/ucd/EastAsianWidth.txt", &ucd_base_url, &ucd_version),
7952
&data_dir.join(&ucd_version).join("ucd").join("EastAsianWidth.txt"),
8053
&mut code_point_descriptions,
81-
|x| &mut x.east_asian_width
54+
|x| &mut x.east_asian_width,
8255
)?;
8356

8457
parsers::parse_single_column(
8558
&format!("{}/{}/ucd/LineBreak.txt", &ucd_base_url, &ucd_version),
8659
&data_dir.join(&ucd_version).join("ucd").join("LineBreak.txt"),
8760
&mut code_point_descriptions,
88-
|x| &mut x.line_break
61+
|x| &mut x.line_break,
8962
)?;
9063

9164
parsers::parse_single_column(
9265
&format!("{}/{}/ucd/auxiliary/WordBreakProperty.txt", &ucd_base_url, &ucd_version),
93-
&data_dir.join(&ucd_version).join("ucd").join("auxiliary").join("WordBreakProperty.txt"),
66+
&data_dir
67+
.join(&ucd_version)
68+
.join("ucd")
69+
.join("auxiliary")
70+
.join("WordBreakProperty.txt"),
9471
&mut code_point_descriptions,
95-
|x| &mut x.word_break
72+
|x| &mut x.word_break,
9673
)?;
9774

9875
parsers::parse_single_column(
9976
&format!("{}/{}/ucd/auxiliary/SentenceBreakProperty.txt", &ucd_base_url, &ucd_version),
100-
&data_dir.join(&ucd_version).join("ucd").join("auxiliary").join("SentenceBreakProperty.txt"),
77+
&data_dir
78+
.join(&ucd_version)
79+
.join("ucd")
80+
.join("auxiliary")
81+
.join("SentenceBreakProperty.txt"),
10182
&mut code_point_descriptions,
102-
|x| &mut x.sentence_break
83+
|x| &mut x.sentence_break,
10384
)?;
10485

10586
parsers::parse_single_column(
10687
&format!("{}/{}/ucd/auxiliary/GraphemeBreakProperty.txt", &ucd_base_url, &ucd_version),
107-
&data_dir.join(&ucd_version).join("ucd").join("auxiliary").join("GraphemeBreakProperty.txt"),
88+
&data_dir
89+
.join(&ucd_version)
90+
.join("ucd")
91+
.join("auxiliary")
92+
.join("GraphemeBreakProperty.txt"),
10893
&mut code_point_descriptions,
109-
|x| &mut x.grapheme_break
94+
|x| &mut x.grapheme_break,
11095
)?;
11196

11297
parsers::parse_single_column(
11398
&format!("{}/{}/ucd/Scripts.txt", &ucd_base_url, &ucd_version),
11499
&data_dir.join(&ucd_version).join("ucd").join("Scripts.txt"),
115100
&mut code_point_descriptions,
116-
|x| &mut x.script
101+
|x| &mut x.script,
117102
)?;
118103

119104
parsers::parse_existance_column(
120105
&format!("{}/{}/ucd/CompositionExclusions.txt", &ucd_base_url, &ucd_version),
121106
&data_dir.join(&ucd_version).join("ucd").join("CompositionExclusions.txt"),
122107
&mut code_point_descriptions,
123-
|x| &mut x.composition_exclusion
108+
|x| &mut x.composition_exclusion,
124109
)?;
125110

126111
parsers::parse_prop_list_columns(
127112
&format!("{}/{}/ucd/PropList.txt", &ucd_base_url, &ucd_version),
128113
&data_dir.join(&ucd_version).join("ucd").join("PropList.txt"),
129-
&mut code_point_descriptions
114+
&mut code_point_descriptions,
130115
)?;
131116

132-
generate_enum_table(code_dir, "east_asian_width", vec!["N".to_string()], |x| {
133-
&code_point_descriptions[x].east_asian_width
117+
generate_enum_table(code_dir, "east_asian_width", vec!["N".to_string()], |x| &code_point_descriptions[x].east_asian_width)?;
118+
generate_enum_table(code_dir, "line_break", vec!["N".to_string()], |x| &code_point_descriptions[x].line_break)?;
119+
generate_enum_table(code_dir, "word_break", vec!["N".to_string()], |x| &code_point_descriptions[x].word_break)?;
120+
generate_enum_table(code_dir, "sentence_break", vec!["N".to_string()], |x| &code_point_descriptions[x].sentence_break)?;
121+
generate_enum_table(code_dir, "grapheme_break", vec!["N".to_string()], |x| &code_point_descriptions[x].grapheme_break)?;
122+
generate_enum_table(code_dir, "script", vec!["N".to_string()], |x| &code_point_descriptions[x].script)?;
123+
124+
generate_bool_table(code_dir, "composition_exclusion", |x| code_point_descriptions[x].composition_exclusion)?;
125+
generate_bool_table(code_dir, "white_space", |x| code_point_descriptions[x].white_space)?;
126+
generate_bool_table(code_dir, "bidi_control", |x| code_point_descriptions[x].bidi_control)?;
127+
generate_bool_table(code_dir, "join_control", |x| code_point_descriptions[x].join_control)?;
128+
generate_bool_table(code_dir, "dash", |x| code_point_descriptions[x].dash)?;
129+
generate_bool_table(code_dir, "hyphen", |x| code_point_descriptions[x].hyphen)?;
130+
generate_bool_table(code_dir, "quotation_mark", |x| code_point_descriptions[x].quotation_mark)?;
131+
generate_bool_table(code_dir, "terminal_punctuation", |x| code_point_descriptions[x].terminal_punctuation)?;
132+
generate_bool_table(code_dir, "other_math", |x| code_point_descriptions[x].other_math)?;
133+
generate_bool_table(code_dir, "hex_digit", |x| code_point_descriptions[x].hex_digit)?;
134+
generate_bool_table(code_dir, "ascii_hex_digit", |x| code_point_descriptions[x].ascii_hex_digit)?;
135+
generate_bool_table(code_dir, "other_alphabetic", |x| code_point_descriptions[x].other_alphabetic)?;
136+
generate_bool_table(code_dir, "ideographic", |x| code_point_descriptions[x].ideographic)?;
137+
generate_bool_table(code_dir, "diacritic", |x| code_point_descriptions[x].diacritic)?;
138+
generate_bool_table(code_dir, "extender", |x| code_point_descriptions[x].extender)?;
139+
generate_bool_table(code_dir, "other_lowercase", |x| code_point_descriptions[x].other_lowercase)?;
140+
generate_bool_table(code_dir, "other_uppercase", |x| code_point_descriptions[x].other_uppercase)?;
141+
generate_bool_table(code_dir, "noncharacter_code_point", |x| code_point_descriptions[x].noncharacter_code_point)?;
142+
generate_bool_table(code_dir, "other_grapheme_extend", |x| code_point_descriptions[x].other_grapheme_extend)?;
143+
generate_bool_table(code_dir, "ids_unary_operator", |x| code_point_descriptions[x].ids_unary_operator)?;
144+
generate_bool_table(code_dir, "ids_binary_operator", |x| code_point_descriptions[x].ids_binary_operator)?;
145+
generate_bool_table(code_dir, "ids_trinary_operator", |x| code_point_descriptions[x].ids_trinary_operator)?;
146+
generate_bool_table(code_dir, "radical", |x| code_point_descriptions[x].radical)?;
147+
generate_bool_table(code_dir, "unified_ideograph", |x| code_point_descriptions[x].unified_ideograph)?;
148+
generate_bool_table(code_dir, "other_default_ignorable_code_point", |x| {
149+
code_point_descriptions[x].other_default_ignorable_code_point
134150
})?;
135-
136-
generate_enum_table(code_dir, "line_break", vec!["N".to_string()], |x| {
137-
&code_point_descriptions[x].line_break
138-
})?;
139-
140-
generate_enum_table(code_dir, "word_break", vec!["N".to_string()], |x| {
141-
&code_point_descriptions[x].word_break
142-
})?;
143-
144-
generate_enum_table(code_dir, "sentence_break", vec!["N".to_string()], |x| {
145-
&code_point_descriptions[x].sentence_break
146-
})?;
147-
148-
generate_enum_table(code_dir, "grapheme_break", vec!["N".to_string()], |x| {
149-
&code_point_descriptions[x].grapheme_break
150-
})?;
151-
152-
generate_enum_table(code_dir, "script", vec!["N".to_string()], |x| {
153-
&code_point_descriptions[x].script
154-
})?;
155-
156-
generate_bool_table(code_dir, "composition_exclusion", |x| {
157-
code_point_descriptions[x].composition_exclusion
158-
})?;
159-
151+
generate_bool_table(code_dir, "deprecated", |x| code_point_descriptions[x].deprecated)?;
152+
generate_bool_table(code_dir, "soft_dotted", |x| code_point_descriptions[x].soft_dotted)?;
153+
generate_bool_table(code_dir, "logical_order_exception", |x| code_point_descriptions[x].logical_order_exception)?;
154+
generate_bool_table(code_dir, "other_id_start", |x| code_point_descriptions[x].other_id_start)?;
155+
generate_bool_table(code_dir, "other_id_continue", |x| code_point_descriptions[x].other_id_continue)?;
156+
generate_bool_table(code_dir, "id_compat_math_continue", |x| code_point_descriptions[x].id_compat_math_continue)?;
157+
generate_bool_table(code_dir, "id_compat_math_start", |x| code_point_descriptions[x].id_compat_math_start)?;
158+
generate_bool_table(code_dir, "sentence_terminal", |x| code_point_descriptions[x].sentence_terminal)?;
159+
generate_bool_table(code_dir, "variation_selector", |x| code_point_descriptions[x].variation_selector)?;
160+
generate_bool_table(code_dir, "pattern_white_space", |x| code_point_descriptions[x].pattern_white_space)?;
161+
generate_bool_table(code_dir, "pattern_syntax", |x| code_point_descriptions[x].pattern_syntax)?;
162+
generate_bool_table(code_dir, "prepended_concatenation_mark", |x| code_point_descriptions[x].prepended_concatenation_mark)?;
163+
generate_bool_table(code_dir, "regional_indicator", |x| code_point_descriptions[x].regional_indicator)?;
164+
generate_bool_table(code_dir, "modifier_combining_mark", |x| code_point_descriptions[x].modifier_combining_mark)?;
160165

161166
return Ok(());
162167
}

src/ascii_hex_digit.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// This file was generated by the cargo-build script.

/// Number of consecutive code-points covered by one chunk of the column table.
const ASCII_HEX_DIGIT_CHUNK_SIZE: usize = 32;
/// Width, in bits, of a single entry of the column table.
const ASCII_HEX_DIGIT_COLUMN_BITS: usize = 1;
/// Number of entries in the index table; chunk numbers past the end are clamped to the last entry.
const ASCII_HEX_DIGIT_INDEX_LEN: usize = 5;
/// Width, in bits, of a single entry of the index table.
const ASCII_HEX_DIGIT_INDEX_BITS: usize = 2;

/// Byte offset of the index table inside `ASCII_HEX_DIGIT_DATA`.
const ASCII_HEX_DIGIT_INDEX_BYTE_OFFSET: usize = 12;

/// Bit-packed column table, followed by the bit-packed index table and one padding byte.
const ASCII_HEX_DIGIT_DATA: [u8; 15] = [
    // Column table
    0, 0, 0, 0, 0, 0, 255, 3, 126, 0, 0, 0,
    // Index table
    164, 0,
    // Padding to handle unaligned word reads.
    0,
];

/// Get the AsciiHexDigit attribute for a Unicode code-point.
///
/// The lookup has two stages. The code-point is split into a chunk number and a position inside
/// that chunk. The chunk number selects an entry of the index table, and that entry selects which
/// chunk of the column table holds the bit for the code-point. Chunk numbers at or beyond
/// `ASCII_HEX_DIGIT_INDEX_LEN` all share the last index entry.
///
/// # Arguments
///  - `code_point` A code-point in the form of a rust `char`.
///
/// # Returns
/// `true` when the bit stored for `code_point` in the column table is set, `false` otherwise.
#[must_use]
pub const fn get_ascii_hex_digit(code_point: char) -> bool {
    const INDEX_MASK: usize = (1 << ASCII_HEX_DIGIT_INDEX_BITS) - 1;
    const COLUMN_MASK: usize = 1;

    let code_point_value = code_point as usize;
    let code_point_lo = code_point_value % ASCII_HEX_DIGIT_CHUNK_SIZE;
    let mut code_point_hi = code_point_value / ASCII_HEX_DIGIT_CHUNK_SIZE;
    if code_point_hi > ASCII_HEX_DIGIT_INDEX_LEN - 1 {
        code_point_hi = ASCII_HEX_DIGIT_INDEX_LEN - 1;
    }

    // Read two little-endian bytes so that an entry which starts late in a byte is still fully
    // covered; for the last entry the second byte may be the padding byte of the data table.
    let index_offset = code_point_hi * ASCII_HEX_DIGIT_INDEX_BITS;
    let index_byte = ASCII_HEX_DIGIT_INDEX_BYTE_OFFSET + index_offset / 8;
    let index_word = u16::from_le_bytes([ASCII_HEX_DIGIT_DATA[index_byte], ASCII_HEX_DIGIT_DATA[index_byte + 1]]) as usize;
    let index = (index_word >> (index_offset % 8)) & INDEX_MASK;

    // A column entry is one bit wide, so a single byte always contains it.
    let column_offset = (index * ASCII_HEX_DIGIT_CHUNK_SIZE + code_point_lo) * ASCII_HEX_DIGIT_COLUMN_BITS;
    let column_byte = ASCII_HEX_DIGIT_DATA[column_offset / 8] as usize;

    ((column_byte >> (column_offset % 8)) & COLUMN_MASK) != 0
}
61+

src/bidi_control.rs

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// This file was generated by the cargo-build script.

/// Number of consecutive code-points covered by one chunk of the column table.
const BIDI_CONTROL_CHUNK_SIZE: usize = 128;
/// Width, in bits, of a single entry of the column table.
const BIDI_CONTROL_COLUMN_BITS: usize = 1;
/// Number of entries in the index table; chunk numbers past the end are clamped to the last entry.
const BIDI_CONTROL_INDEX_LEN: usize = 66;
/// Width, in bits, of a single entry of the index table.
const BIDI_CONTROL_INDEX_BITS: usize = 2;

/// Byte offset of the index table inside `BIDI_CONTROL_DATA`.
const BIDI_CONTROL_INDEX_BYTE_OFFSET: usize = 48;

/// Bit-packed column table, followed by the bit-packed index table and one padding byte.
const BIDI_CONTROL_DATA: [u8; 66] = [
    // Column table (one row per 128-code-point chunk)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 192, 0, 0, 0, 124, 0, 0, 0, 0, 0, 0, 192, 3, 0, 0,
    // Index table
    0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
    // Padding to handle unaligned word reads.
    0,
];

/// Get the BidiControl attribute for a Unicode code-point.
///
/// The lookup has two stages. The code-point is split into a chunk number and a position inside
/// that chunk. The chunk number selects an entry of the index table, and that entry selects which
/// chunk of the column table holds the bit for the code-point. Chunk numbers at or beyond
/// `BIDI_CONTROL_INDEX_LEN` all share the last index entry.
///
/// # Arguments
///  - `code_point` A code-point in the form of a rust `char`.
///
/// # Returns
/// `true` when the bit stored for `code_point` in the column table is set, `false` otherwise.
#[must_use]
pub const fn get_bidi_control(code_point: char) -> bool {
    const INDEX_MASK: usize = (1 << BIDI_CONTROL_INDEX_BITS) - 1;
    const COLUMN_MASK: usize = 1;

    let code_point_value = code_point as usize;
    let code_point_lo = code_point_value % BIDI_CONTROL_CHUNK_SIZE;
    let mut code_point_hi = code_point_value / BIDI_CONTROL_CHUNK_SIZE;
    if code_point_hi > BIDI_CONTROL_INDEX_LEN - 1 {
        code_point_hi = BIDI_CONTROL_INDEX_LEN - 1;
    }

    // Read two little-endian bytes so that an entry which starts late in a byte is still fully
    // covered; for the last entry the second byte is the padding byte of the data table.
    let index_offset = code_point_hi * BIDI_CONTROL_INDEX_BITS;
    let index_byte = BIDI_CONTROL_INDEX_BYTE_OFFSET + index_offset / 8;
    let index_word = u16::from_le_bytes([BIDI_CONTROL_DATA[index_byte], BIDI_CONTROL_DATA[index_byte + 1]]) as usize;
    let index = (index_word >> (index_offset % 8)) & INDEX_MASK;

    // A column entry is one bit wide, so a single byte always contains it.
    let column_offset = (index * BIDI_CONTROL_CHUNK_SIZE + code_point_lo) * BIDI_CONTROL_COLUMN_BITS;
    let column_byte = BIDI_CONTROL_DATA[column_offset / 8] as usize;

    ((column_byte >> (column_offset % 8)) & COLUMN_MASK) != 0
}
62+

0 commit comments

Comments
 (0)