Skip to content

Commit 0f0b732

Browse files
authored
Merge pull request #92 from LaurenzV/3.0.0
Sync with 3.0.0.
2 parents 4a7642e + c8db555 commit 0f0b732

19 files changed

+346
-126
lines changed

Cargo.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ exclude = ["benches/", "tests/"]
1616
bitflags = "2.4.1"
1717
bytemuck = { version = "1.5", features = ["extern_crate_alloc"] }
1818
smallvec = "1.6"
19-
unicode-bidi-mirroring = "0.1"
20-
unicode-ccc = "0.1.2"
19+
unicode-bidi-mirroring = "0.2"
20+
unicode-ccc = "0.2"
2121
unicode-properties = { version = "0.1.0", default-features = false, features = ["general-category"] }
2222
unicode-script = "0.5.2"
2323
libm = { version = "0.2.2", optional = true }

scripts/gen-arabic-table.py

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/env python3
2+
3+
# Based on harfbuzz/src/gen-arabic-table.py
4+
5+
import os
6+
import urllib.request
7+
8+
# Unicode Character Database files read by this generator. The order is
# significant: the opened files are later addressed by their position here.
DEPENDENCIES = ["ArabicShaping.txt", "UnicodeData.txt", "Blocks.txt"]

# Download whichever data files are not already present in the current
# working directory; files that exist are left untouched.
for dep in DEPENDENCIES:
    if os.path.exists(dep):
        continue
    urllib.request.urlretrieve("https://unicode.org/Public/14.0.0/ucd/" + dep, dep)
17+
18+
# Open every data file. Handles are addressed by their DEPENDENCIES index
# (0 = ArabicShaping.txt, 1 = UnicodeData.txt, 2 = Blocks.txt).
files = [open(x, encoding="utf-8") for x in DEPENDENCIES]

# Consume and keep the first two lines of ArabicShaping.txt and Blocks.txt.
# NOTE: the entries are not in DEPENDENCIES order; index 1 holds the
# Blocks.txt lines and index 2 the UnicodeData.txt placeholder.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
    ["UnicodeData.txt does not have a header."],
]

# Advance ArabicShaping.txt to just past the first line containing the '#'
# rule. readline() returns "" at end of file, so stop there with an error
# instead of looping forever on a file that lacks the marker.
for line in iter(files[0].readline, ""):
    if "##################" in line:
        break
else:
    raise SystemExit("ArabicShaping.txt: '##################' marker line not found")
27+
28+
# Code point (int) -> block name (str); filled in by read_blocks().
blocks = {}


def read_blocks(f):
    """Record the block name of every code point listed in *f*.

    *f* is an iterable of text lines of the form ``START..END; Name`` or
    ``CODE; Name`` (hexadecimal code points), optionally followed by a
    ``#`` comment. Lines without a ``;`` separator are ignored. Results are
    stored in the module-level ``blocks`` mapping; nothing is returned.
    """
    for raw in f:
        # Everything from the first '#' onwards is a comment.
        content = raw.partition("#")[0]

        parts = [piece.strip() for piece in content.split(";")]
        if len(parts) == 1:
            # Blank or comment-only line: no fields to read.
            continue

        bounds = parts[0].split("..")
        first = int(bounds[0], 16)
        final = first if len(bounds) == 1 else int(bounds[1], 16)

        # The range is inclusive at both ends.
        blocks.update(dict.fromkeys(range(first, final + 1), parts[1]))
54+
55+
def _parse_joining_values(f):
    """Return {code point: value name} parsed from ';'-separated lines in *f*."""
    values = {}
    for line in f:
        if line[0] == "#":
            continue

        fields = [x.strip() for x in line.split(";")]
        if len(fields) == 1:
            continue

        u = int(fields[0], 16)
        # Two joining groups get their own value; every other code point is
        # represented by its joining type (third field).
        if fields[3] in ("ALAPH", "DALATH RISH"):
            values[u] = "JOINING_GROUP_" + fields[3].replace(" ", "_")
        else:
            values[u] = "JOINING_TYPE_" + fields[2]
    return values


def _coalesce_ranges(codepoints, max_gap):
    """Group ascending code points into [start, end] runs split at gaps > max_gap."""
    ranges = []
    for u in codepoints:
        if ranges and u - ranges[-1][1] <= max_gap:
            ranges[-1][1] = u
        else:
            ranges.append([u, u])
    return ranges


def print_joining_table(f):
    """Print the Rust joining table and its lookup function for the data in *f*.

    *f* is an iterable of text lines with ';'-separated fields: a hexadecimal
    code point first, the joining type third and the joining group fourth.
    Lines starting with '#' and lines without a ';' are skipped.

    Writes to standard output:
      * the ``JOINING_TABLE`` constant, one entry per code point of every
        coalesced range (code points absent from *f* are emitted as ``X``),
      * one ``JOINING_OFFSET_0X....`` constant per range,
      * the ``joining_type`` function that indexes the table.

    Reads the module-level ``blocks`` mapping to label the table rows.

    Raises:
        KeyError: if a code point listed in *f* has no entry in ``blocks``.
        AssertionError: if two values would get the same abbreviation.
    """
    values = _parse_joining_values(f)

    # Abbreviate each value name to the initials of its words after the first
    # two, e.g. JOINING_GROUP_DALATH_RISH -> "DR", JOINING_TYPE_X -> "X".
    short_value = {}
    for value in sorted({*values.values(), "JOINING_TYPE_X"}):
        short = "".join(x[0] for x in value.split("_")[2:])
        assert short not in short_value.values()

        short_value[value] = short

    uu = sorted(values)
    all_blocks = {blocks[u] for u in uu}

    # Code points no more than 1 + 16 * 5 apart share one run of the table;
    # the holes inside a run are padded with JOINING_TYPE_X below.
    ranges = _coalesce_ranges(uu, 1 + 16 * 5)

    print("#[rustfmt::skip]")
    print("pub const JOINING_TABLE: &[JoiningType] = &[")
    last_block = None
    offset = 0  # index in JOINING_TABLE where the current range starts

    join_offsets = []

    for start, end in ranges:
        join_offsets.append(
            "const JOINING_OFFSET_0X%04X: usize = %d;" % (start, offset)
        )

        for u in range(start, end + 1):
            block = blocks.get(u, last_block)
            value = values.get(u, "JOINING_TYPE_X")

            # Start a new labelled section at every block change and at the
            # beginning of every range.
            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    print("\n /* FILLER */")
                last_block = block
                if u % 32 != 0:
                    # Section starts mid-row: label the row and pad up to u.
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")

            # Rows hold 32 entries; label each row with its first code point.
            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")

            val = short_value[value]

            # Joining type C is emitted as D.
            if val == "C":
                val = "D"

            print("%s," % val, end="")
        print()

        offset += end - start + 1
    print("];")
    print()

    for declaration in join_offsets:
        print(declaration)

    page_bits = 12
    print()
    print("pub fn joining_type(u: char) -> JoiningType {")
    print(" let u = u as u32;")
    print(" match u >> %d {" % page_bits)
    # Pages (code point >> page_bits) touched by either end of any range.
    pages = {bound >> page_bits for bounds in ranges for bound in bounds}
    for p in sorted(pages):
        print(" 0x%0X => {" % p)
        for start, end in ranges:
            if p not in [start >> page_bits, end >> page_bits]:
                continue
            offset_name = "JOINING_OFFSET_0X%04X" % start
            print(" if (0x%04X..=0x%04X).contains(&u) {" % (start, end))
            print(
                " return JOINING_TABLE[u as usize - 0x%04X + %s]"
                % (start, offset_name)
            )
            print(" }")
        print(" }")
    print(" _ => {}")
    print(" }")
    print()
    print(" X")
    print("}")
    print()
167+
168+
169+
# Emit the generated Rust source on stdout: banner, `use` line, then the table.
print("// WARNING: this file was generated by ../scripts/gen-arabic-table.py")
print()
print(
    "use super::arabic::JoiningType::{self, GroupAlaph as A, GroupDalathRish as DR, D, L, R, T, U, X};"
)
print()

try:
    # Block names must be loaded first: print_joining_table() looks them up.
    read_blocks(files[2])
    print_joining_table(files[0])
finally:
    # Release the data-file handles, which are otherwise never closed.
    for handle in files:
        handle.close()

scripts/gen-indic-table.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import io
66
import os
7-
import sys
87
import urllib.request
98

109
DEPENDENCIES = [
@@ -15,7 +14,7 @@
1514

1615
for dep in DEPENDENCIES:
1716
if not os.path.exists(dep):
18-
urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/' + dep, dep)
17+
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/' + dep, dep)
1918

2019
ALLOWED_SINGLES = [0x00A0, 0x25CC]
2120
ALLOWED_BLOCKS = [

scripts/gen-unicode-norm-table.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import urllib.request
44
import os
55

6-
URL = 'https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
6+
URL = 'https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt'
77
FILE_NAME = 'UnicodeData.txt'
88

99

@@ -22,7 +22,7 @@ def hex_to_char_rs(c):
2222
print('//! The current implementation is not the fastest one. Just good enough.')
2323
print()
2424
print('#[allow(dead_code)]')
25-
print('pub const UNICODE_VERSION: (u8, u8, u8) = (13, 0, 0);')
25+
print('pub const UNICODE_VERSION: (u8, u8, u8) = (14, 0, 0);')
2626
print()
2727
print('// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.')
2828
print('pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[')

scripts/gen-universal-table.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
for f in files:
1515
if not os.path.exists(f):
1616
urllib.request.urlretrieve(
17-
'https://unicode.org/Public/13.0.0/ucd/' + f, f)
17+
'https://unicode.org/Public/14.0.0/ucd/' + f, f)
1818

1919
files = [io.open(x, encoding='utf-8') for x in files]
2020

scripts/gen-vowel-constraints.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import urllib.request
1616

1717
if not os.path.exists('Scripts.txt'):
18-
urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/Scripts.txt', 'Scripts.txt')
18+
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/Scripts.txt', 'Scripts.txt')
1919

2020
with io.open('Scripts.txt', encoding='utf-8') as f:
2121
scripts_header = [f.readline() for i in range(2)]

src/common.rs

+10-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,10 @@ impl Direction {
129129

130130
// Unicode-13.0 additions
131131
script::CHORASMIAN |
132-
script::YEZIDI => {
132+
script::YEZIDI |
133+
134+
// Unicode-14.0 additions
135+
script::OLD_UYGHUR => {
133136
Some(Direction::RightToLeft)
134137
}
135138

@@ -442,6 +445,12 @@ pub mod script {
442445
pub const DIVES_AKURU: Script = Script::from_bytes(b"Diak");
443446
pub const KHITAN_SMALL_SCRIPT: Script = Script::from_bytes(b"Kits");
444447
pub const YEZIDI: Script = Script::from_bytes(b"Yezi");
448+
// Since 14.0
449+
pub const CYPRO_MINOAN: Script = Script::from_bytes(b"Cpmn");
450+
pub const OLD_UYGHUR: Script = Script::from_bytes(b"Ougr");
451+
pub const TANGSA: Script = Script::from_bytes(b"Tnsa");
452+
pub const TOTO: Script = Script::from_bytes(b"Toto");
453+
pub const VITHKUQI: Script = Script::from_bytes(b"Vith");
445454

446455
// https://github.com/harfbuzz/harfbuzz/issues/1162
447456
pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");

0 commit comments

Comments
 (0)