Skip to content

Commit 50053c2

Browse files
committed
east asian width generator
1 parent cf7b1db commit 50053c2

File tree

11 files changed

+200
-133
lines changed

11 files changed

+200
-133
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
data/
22
target/
3+
src/east_asian_width.rs

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ url = "https://www.unicode.org/Public"
1515
reqwest = { version = "0.12", features = ["blocking"] }
1616
cargo_metadata = { version = "0.19" }
1717
regex = { version = "1.11.1" }
18-
18+
thiserror = { version = "2.0.11" }
19+
convert_case = { version = "0.7.1" }

build/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ fn main() {
1616
let ucd_version = root.metadata["ucd"]["version"].as_str().unwrap();
1717
let ucd_url = root.metadata["ucd"]["url"].as_str().unwrap();
1818
let data_dir = std::path::Path::new(&manifest_dir).join("data");
19+
let code_dir = std::path::Path::new(&manifest_dir).join("src");
1920

20-
if let Err(e) = ucd_generator(ucd_url, ucd_version, &data_dir) {
21+
if let Err(e) = ucd_generator(ucd_url, ucd_version, &data_dir, &code_dir) {
2122
println!("cargo::error={}", e);
2223
}
2324
}

build/ucd_generator.rs

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,42 @@
11

22
mod code_point_description;
3-
use code_point_description::*;
4-
mod parse_east_asian_width;
5-
use parse_east_asian_width::*;
3+
mod parsers;
64
mod download;
75
mod column;
8-
mod generator;
6+
mod generators;
7+
8+
use thiserror::Error;
9+
use code_point_description::*;
10+
11+
#[derive(Error, Debug)]
12+
pub enum Error {
13+
#[error("Failed to write")]
14+
IO(#[from] std::io::Error),
15+
#[error("Failed to format")]
16+
Generator(#[from] generators::Error),
17+
#[error("Failed to parse")]
18+
Parser(#[from] parsers::Error),
19+
}
920

10-
pub fn ucd_generator(ucd_base_url: &str, ucd_version: &str, data_dir: &std::path::Path, code_dir: &std::path::Path) -> Result<(), String>
11-
{
12-
const CHUNK_SIZE : usize = 32;
1321

22+
pub fn ucd_generator(ucd_base_url: &str, ucd_version: &str, data_dir: &std::path::Path, code_dir: &std::path::Path) -> Result<(), Error>
23+
{
1424
let mut code_point_descriptions = Vec::<CodePointDescription>::with_capacity(0x110000);
1525
code_point_descriptions.resize(0x110000, CodePointDescription::new());
1626

17-
if let Err(e) = parse_east_asian_width(&ucd_base_url, &ucd_version, &data_dir, &mut code_point_descriptions) {
18-
return Err(e);
19-
}
27+
parsers::parse_east_asian_width(&ucd_base_url, &ucd_version, &data_dir, &mut code_point_descriptions)?;
2028

29+
const EAST_ASIAN_WIDTH_CHUNK_SIZE : usize = 256;
2130
let mut east_asian_width_enum = vec!["N".to_string()];
2231
let mut east_asian_width_column = column::map_str_to_int(&mut east_asian_width_enum, |x| &code_point_descriptions[x].east_asian_width);
23-
let east_asian_width_index = column::dedup(&mut east_asian_width_column, CHUNK_SIZE);
24-
generate_enum_table(&code_dir, "east_asian_width", east_asian_width_enum, east_asian_width_column, east_asian_width_index)?;
32+
let east_asian_width_index = column::dedup(&mut east_asian_width_column, EAST_ASIAN_WIDTH_CHUNK_SIZE);
33+
generators::generate_enum_table(&code_dir, "east_asian_width", &east_asian_width_enum, &east_asian_width_column, &east_asian_width_index, EAST_ASIAN_WIDTH_CHUNK_SIZE)?;
2534

2635
//if let Err(e) = parse_line_break_properties(&ucd_base_url, &ucd_version, &data_dir, &mut code_point_descriptions) {
2736
// return Err(e);
2837
//}
2938

3039

31-
3240
return Ok(());
3341
}
3442

build/ucd_generator/column.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ fn compress_insert_value(bytes: &mut Vec<u8>, offset : usize, mut value : usize)
154154
pub fn compress(input: &Vec<usize>, num_bits: usize) -> Vec<u8>
155155
{
156156
let total_num_bits = num_bits * input.len();
157-
let total_num_bytes = (total_num_bits + 7) / num_bits;
157+
let total_num_bytes = (total_num_bits + 7) / 8;
158158

159159
let mut r = Vec::<u8>::with_capacity(total_num_bytes + 7);
160160
r.resize(total_num_bytes + 7, 0);

build/ucd_generator/download.rs

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,43 @@
11

22
use std::io::Seek;
3+
use thiserror::Error;
4+
5+
#[derive(Error, Debug)]
6+
pub enum Error {
7+
#[error("Failed to write")]
8+
IO(#[from] std::io::Error),
9+
#[error("Failed to download")]
10+
Download(#[from] reqwest::Error),
11+
#[error("Failed to get file from server")]
12+
BadStatus(reqwest::StatusCode),
13+
}
314

4-
pub fn download_and_open_file(url: &str, path: &std::path::Path) -> Result<std::fs::File, String> {
15+
pub fn download_and_open_file(url: &str, path: &std::path::Path) -> Result<std::fs::File, Error> {
516
if std::fs::exists(&path).unwrap_or(false) {
617
// File already exists, no need to download.
7-
match std::fs::File::open(&path) {
8-
Err(e) => return Err(format!("Could not open file: {}", &e)),
9-
Ok(f) => return Ok(f),
10-
};
18+
let fd = std::fs::File::open(&path)?;
19+
return Ok(fd);
1120
}
1221

1322
if let Some(dir) = path.parent() {
1423
// Create the dir that the file is downloaded in.
1524
if !std::fs::exists(&dir).unwrap_or(false) {
16-
if let Err(e) = std::fs::create_dir_all(&dir) {
17-
return Err(format!("Could not create directory hierarchy {:?}: {}", &dir, e));
18-
}
25+
std::fs::create_dir_all(&dir)?;
1926
}
2027
}
2128

22-
let response = match reqwest::blocking::get(url) {
23-
Err(e) => return Err(format!("Could not request download: {}: {}", &url, e)),
24-
Ok(x) => x,
25-
};
26-
29+
let response = reqwest::blocking::get(url)?;
2730
if !response.status().is_success() {
28-
return Err(format!("Could not download: {}: {}", &url, response.status()));
31+
return Err(Error::BadStatus(response.status()));
2932
}
3033

31-
let body = match response.text() {
32-
Err(e) => return Err(format!("Could not get body when downloading: {}: {}", &url, e)),
33-
Ok(x) => x,
34-
};
35-
36-
let mut out_file = match std::fs::File::create_new(&path) {
37-
Err(e) => return Err(format!("Could not create file: {:?}: {}", &path, e)),
38-
Ok(x) => x,
39-
};
34+
let body = response.text()?;
4035

41-
if let Err(e) = std::io::copy(&mut body.as_bytes(), &mut out_file) {
42-
return Err(format!("Could not copy data to file: {:?}: {}", &path, e));
43-
}
36+
let mut fd = std::fs::File::create_new(&path)?;
37+
std::io::copy(&mut body.as_bytes(), &mut fd)?;
4438

4539
// Go to the first byte of the file, so that we can start reading from it.
46-
if let Err(e) = out_file.seek(std::io::SeekFrom::Start(0)) {
47-
return Err(format!("Could not seek file: {:?}: {}", &path, e));
48-
}
49-
50-
return Ok(out_file);
40+
fd.seek(std::io::SeekFrom::Start(0))?;
41+
return Ok(fd);
5142
}
5243

build/ucd_generator/generator.rs

Lines changed: 0 additions & 52 deletions
This file was deleted.

build/ucd_generator/generators.rs

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
2+
use crate::ucd_generator::column;
3+
use std::io::Write;
4+
use thiserror::Error;
5+
use convert_case::{Case, Casing};
6+
7+
#[derive(Error, Debug)]
8+
pub enum Error {
9+
#[error("Failed to write")]
10+
IO(#[from] std::io::Error),
11+
#[error("Failed to format")]
12+
Formatting(#[from] std::fmt::Error),
13+
}
14+
15+
pub fn generate_enum_table(code_dir : &std::path::Path, name : &str, enum_values: &Vec<String>, column: &Vec<usize>, index: &Vec<usize>, chunk_size: usize) -> Result<(), Error>
16+
{
17+
let upper_name = name.to_case(Case::Constant);
18+
let camel_name = name.to_case(Case::Pascal);
19+
20+
let index_max_value = *index.iter().max().unwrap_or(&0);
21+
22+
let index_bits = (index_max_value + 1).next_power_of_two().trailing_zeros() as usize;
23+
let column_bits = enum_values.len().next_power_of_two().trailing_zeros() as usize;
24+
25+
let column_bytes = column::compress(column, column_bits);
26+
let index_bytes = column::compress(index, index_bits);
27+
28+
let code_path = code_dir.join(format!("{}.rs", &name));
29+
let mut fd = std::fs::File::create(&code_path)?;
30+
31+
write!(fd, "const {}_CHUNK_SIZE : usize = {};\n", upper_name, chunk_size)?;
32+
write!(fd, "const {}_COLUMN_BITS : usize = {};\n", upper_name, column_bits)?;
33+
write!(fd, "const {}_INDEX_LEN : usize = {};\n\n", upper_name, index.len())?;
34+
write!(fd, "const {}_INDEX_BITS : usize = {};\n\n", upper_name, index_bits)?;
35+
36+
write!(fd, "const {}_COLUMN: [u8; {}] = [", upper_name, column_bytes.len())?;
37+
for (i, v) in column_bytes.iter().enumerate() {
38+
if i % 32 == 0 {
39+
write!(fd, "\n ")?;
40+
}
41+
write!(fd, "{:3},", v)?;
42+
}
43+
write!(fd, "\n];\n\n")?;
44+
45+
write!(fd, "const {}_INDEX: [u8; {}] = [", upper_name, index_bytes.len())?;
46+
for (i, v) in index_bytes.iter().enumerate() {
47+
if i % 32 == 0 {
48+
write!(fd, "\n ")?;
49+
}
50+
write!(fd, "{:3},", v)?;
51+
}
52+
write!(fd, "\n];\n\n")?;
53+
54+
write!(fd, "#[derive(Debug,Clone,Copy,PartialEq)]\n")?;
55+
56+
write!(fd, "pub enum {} {{\n", camel_name)?;
57+
for (i, v) in enum_values.iter().enumerate() {
58+
write!(fd, " {} = {},\n", v, i)?;
59+
}
60+
write!(fd, "}}\n\n")?;
61+
62+
write!(fd, "pub const fn get_{}(code_point: char) -> {}\n", name, camel_name)?;
63+
write!(fd, "{{\n")?;
64+
write!(fd, " const INDEX_MASK : usize = (1 << {}_INDEX_BITS) - 1;\n", upper_name)?;
65+
write!(fd, " const COLUMN_MASK : usize = (1 << {}_COLUMN_BITS) - 1;\n", upper_name)?;
66+
67+
write!(fd, " let code_point_value = code_point as usize;\n")?;
68+
write!(fd, " let code_point_lo = code_point_value & {}_CHUNK_SIZE;\n", upper_name)?;
69+
write!(fd, " let mut code_point_hi = code_point_value / {}_CHUNK_SIZE;\n", upper_name)?;
70+
write!(fd, " if code_point_hi > {}_INDEX_LEN - 1 {{\n", upper_name)?;
71+
write!(fd, " code_point_hi = {}_INDEX_LEN - 1;\n", upper_name)?;
72+
write!(fd, " }}\n\n")?;
73+
74+
write!(fd, " let index_offset = code_point_hi * {}_INDEX_BITS;\n", upper_name)?;
75+
write!(fd, " let index_byte_offset = index_offset / 8;\n")?;
76+
write!(fd, " let index_bit_offset = index_offset % 8;\n")?;
77+
write!(fd, " let mut index: usize = 0;\n")?;
78+
let index_bytes_to_read = (index_bits + 7) / 8 + 1;
79+
for i in 0..index_bytes_to_read {
80+
write!(fd, " index |= ({}_INDEX[index_byte_offset + {}] as usize) << {};\n", upper_name, i, i * 8)?;
81+
}
82+
write!(fd, " index >>= index_bit_offset;\n")?;
83+
write!(fd, " index &= INDEX_MASK;\n\n")?;
84+
85+
write!(fd, " let column_offset = (index * {}_CHUNK_SIZE + code_point_lo) * {}_COLUMN_BITS;\n", upper_name, upper_name)?;
86+
write!(fd, " let column_byte_offset = column_offset / 8;\n")?;
87+
write!(fd, " let column_bit_offset = column_offset % 8;\n")?;
88+
write!(fd, " let mut value: usize = 0;\n")?;
89+
let column_bytes_to_read = (column_bits + 7) / 8 + 1;
90+
for i in 0..column_bytes_to_read {
91+
write!(fd, " value |= ({}_COLUMN[column_byte_offset + {}] as usize) << {};\n", upper_name, i, i * 8)?;
92+
}
93+
write!(fd, " value >>= column_bit_offset;\n")?;
94+
write!(fd, " value &= COLUMN_MASK;\n\n")?;
95+
96+
write!(fd, " return match value {{\n")?;
97+
for (i, v) in enum_values.iter().enumerate() {
98+
write!(fd, " {} => {}::{},\n", i, camel_name, v)?;
99+
}
100+
write!(fd, " _ => panic!(\"Invalid value.\")\n")?;
101+
write!(fd, " }};\n")?;
102+
write!(fd, "}}\n\n")?;
103+
104+
return Ok(());
105+
}

0 commit comments

Comments
 (0)