Skip to content

Commit 26fec9c

Browse files
authored
Merge pull request #97 from LaurenzV/4.0
Sync with 4.0.1
2 parents 660d0d0 + bdc935f commit 26fec9c

16 files changed

+825
-452
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ If you are interested in helping the project checkout
1919
`rustybuzz` is a complete [harfbuzz](https://github.com/harfbuzz/harfbuzz)'s
2020
shaping algorithm port to Rust.
2121

22-
Matches `harfbuzz` v2.9.1
22+
Matches `harfbuzz` v4.0.1
2323

2424
## Why?
2525

scripts/gen-universal-table.py

Lines changed: 47 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
headers[j - 1].append(line)
3535
headers.append(["UnicodeData.txt does not have a header."])
3636

37-
data = [{} for _ in files]
37+
unicode_data = [{} for _ in files]
3838
values = [{} for _ in files]
3939
for i, f in enumerate(files):
4040
for line in f:
@@ -68,48 +68,23 @@
6868

6969
i0 = i if i < 7 else i - 7
7070
for u in range(start, end + 1):
71-
data[i0][u] = t
71+
unicode_data[i0][u] = t
7272
values[i0][t] = values[i0].get(t, 0) + end - start + 1
7373

7474
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
7575

76-
# TODO Characters that are not in Unicode Indic files, but used in USE
77-
data[0][0x1B61] = defaults[0]
78-
data[0][0x1B63] = defaults[0]
79-
data[0][0x1B64] = defaults[0]
80-
data[0][0x1B65] = defaults[0]
81-
data[0][0x1B66] = defaults[0]
82-
data[0][0x1B67] = defaults[0]
83-
data[0][0x1B69] = defaults[0]
84-
data[0][0x1B6A] = defaults[0]
85-
data[0][0x2060] = defaults[0]
86-
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
87-
data[0][0x1B5B] = 'Consonant_Placeholder'
88-
data[0][0x1B5C] = 'Consonant_Placeholder'
89-
data[0][0x1B5F] = 'Consonant_Placeholder'
90-
data[0][0x1B62] = 'Consonant_Placeholder'
91-
data[0][0x1B68] = 'Consonant_Placeholder'
92-
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
93-
data[0][0x11C44] = 'Consonant_Placeholder'
94-
data[0][0x11C45] = 'Consonant_Placeholder'
95-
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
96-
data[0][0x111C8] = 'Consonant_Placeholder'
97-
9876
# Merge data into one dict:
99-
for i, v in enumerate(defaults):
100-
values[i][v] = values[i].get(v, 0) + 1
77+
for i,v in enumerate (defaults):
78+
values[i][v] = values[i].get (v, 0) + 1
10179
combined = {}
102-
for i, d in enumerate(data):
103-
for u, v in d.items():
80+
for i,d in enumerate (unicode_data):
81+
for u,v in d.items ():
10482
if not u in combined:
10583
if i >= 4:
10684
continue
107-
combined[u] = list(defaults)
85+
combined[u] = list (defaults)
10886
combined[u][i] = v
109-
combined = {k: v for k, v in combined.items(
110-
) if v[6] not in DISABLED_SCRIPTS}
111-
data = combined
112-
del combined
87+
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
11388

11489

11590
property_names = [
@@ -234,8 +209,8 @@ def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
234209

235210

236211
def is_CGJ(U, UISC, UDI, UGC, AJT):
237-
# Also includes VARIATION_SELECTOR, WJ, and ZWJ
238-
return U == 0x200D or UDI and UGC in [Mc, Me, Mn]
212+
# Also includes VARIATION_SELECTOR and ZWJ
213+
return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
239214

240215

241216
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
@@ -303,12 +278,13 @@ def is_ZWNJ(U, UISC, UDI, UGC, AJT):
303278

304279

305280
def is_OTHER(U, UISC, UDI, UGC, AJT):
306-
# Also includes BASE_IND, Rsv, and SYM
307-
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
281+
# Also includes BASE_IND and SYM
282+
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
308283
and not is_BASE(U, UISC, UDI, UGC, AJT)
309284
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
310285
and not is_CGJ(U, UISC, UDI, UGC, AJT)
311286
and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
287+
and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
312288
)
313289

314290

@@ -326,16 +302,20 @@ def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
326302

327303

328304
def is_VOWEL(U, UISC, UDI, UGC, AJT):
329-
# https://github.com/harfbuzz/harfbuzz/issues/376
330305
return (UISC == Pure_Killer or
331-
(UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
306+
UGC != Lo and UISC in [Vowel, Vowel_Dependent])
332307

333308

334309
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
335-
# https://github.com/harfbuzz/harfbuzz/issues/376
336310
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
337-
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
311+
UGC != Lo and UISC == Bindu)
338312

313+
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
314+
# Also includes Rsv
315+
return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
316+
and UISC == Other
317+
and not is_CGJ(U, UISC, UDI, UGC, AJT)
318+
) or UGC == Cn
339319

340320
use_mapping = {
341321
'B': is_BASE,
@@ -362,6 +342,7 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
362342
'SM': is_SYM_MOD,
363343
'V': is_VOWEL,
364344
'VM': is_VOWEL_MOD,
345+
'WJ': is_Word_Joiner,
365346
}
366347

367348
use_positions = {
@@ -435,10 +416,6 @@ def map_to_use(data):
435416
if U == 0x1CED:
436417
UISC = Tone_Mark
437418

438-
# TODO: https://github.com/microsoft/font-tools/issues/1
439-
if U == 0xA982:
440-
UISC = Consonant_Succeeding_Repha
441-
442419
values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
443420
assert len(values) == 1, "%s %s %s %s %s %s" % (
444421
hex(U), UISC, UDI, UGC, AJT, values)
@@ -482,19 +459,20 @@ def map_to_use(data):
482459
return out
483460

484461

485-
defaults = ('O', 'No_Block')
486-
data = map_to_use(data)
462+
use_data = map_to_use(combined)
487463

488464
print('// WARNING: this file was generated by ../scripts/gen-universal-table.py')
489465
print()
466+
print('use crate::GlyphInfo;')
467+
print('use unicode_properties::GeneralCategory;')
490468
print('use super::universal::{Category, category::*};')
491469

492470
total = 0
493471
used = 0
494472
last_block = None
495473

496474

497-
def print_block(block, start, end, data):
475+
def print_block(block, start, end, use_data):
498476
global total, used, last_block
499477
if block and block != last_block:
500478
print()
@@ -509,18 +487,24 @@ def print_block(block, start, end, data):
509487
if u % 16 == 0:
510488
print()
511489
print(' /* %04X */' % u, end='')
512-
if u in data:
490+
if u in use_data:
513491
num += 1
514-
d = data.get(u, defaults)
515-
print('%6s,' % d[0], end='')
492+
d = use_data.get(u)
493+
if d is not None:
494+
d = d[0]
495+
elif u in unicode_data[4]:
496+
d = 'O'
497+
else:
498+
d = 'WJ'
499+
print("%6s," % d, end='')
516500

517501
total += end - start + 1
518502
used += num
519503
if block:
520504
last_block = block
521505

522506

523-
uu = sorted(data.keys())
507+
uu = sorted(use_data.keys())
524508

525509
last = -100000
526510
num = 0
@@ -534,19 +518,19 @@ def print_block(block, start, end, data):
534518
for u in uu:
535519
if u <= last:
536520
continue
537-
if data[u][0] == 'O':
521+
if use_data[u][0] == 'O':
538522
continue
539-
block = data[u][1]
523+
block = use_data[u][1]
540524

541525
start = u // 8 * 8
542526
end = start + 1
543-
while end in uu and block == data[end][1]:
527+
while end in uu and block == use_data[end][1]:
544528
end += 1
545529
end = (end - 1) // 8 * 8 + 7
546530

547531
if start != last + 1:
548532
if start - last <= 1 + 16 * 3:
549-
print_block(None, last + 1, start - 1, data)
533+
print_block(None, last + 1, start - 1, use_data)
550534
last = start - 1
551535
else:
552536
if last >= 0:
@@ -556,7 +540,7 @@ def print_block(block, start, end, data):
556540
(start, offset))
557541
starts.append(start)
558542

559-
print_block(block, start, end, data)
543+
print_block(block, start, end, use_data)
560544
last = end
561545
ends.append(last + 1)
562546
offset += ends[-1] - starts[-1]
@@ -570,7 +554,8 @@ def print_block(block, start, end, data):
570554
print(o)
571555
print()
572556
print('#[rustfmt::skip]')
573-
print('pub fn get_category(u: u32) -> Category {')
557+
print('pub fn get_category(info: &GlyphInfo) -> Category {')
558+
print(' let u = info.glyph_id;')
574559
print(' match u >> %d {' % page_bits)
575560
pages = set([u >> page_bits for u in starts + ends])
576561
for p in sorted(pages):
@@ -585,6 +570,10 @@ def print_block(block, start, end, data):
585570
print(' _ => {}')
586571
print(' }')
587572
print()
573+
print(' if info.general_category() == GeneralCategory::Unassigned {')
574+
print(' return WJ;')
575+
print(' }')
576+
print()
588577
print(' O')
589578
print('}')
590579

scripts/ms-use/IndicPositionalCategory-Additional.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Not derivable
33
# Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17
44
# Updated for Unicode 10.0 by Andrew Glass 2017-07-25
5-
# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
5+
# Ammended for Unicode 10.0 by Andrew Glass 2018-09-21
66
# Updated for L2/19-083 by Andrew Glass 2019-05-06
77
# Updated for Unicode 12.1 by Andrew Glass 2019-05-30
88
# Updated for Unicode 13.0 by Andrew Glass 2020-07-28
@@ -58,16 +58,16 @@ AA35   ; Top # Mn       CHAM CONSONANT SIGN
5858
# Indic_Positional_Category=Bottom
5959
0859..085B ; Bottom # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
6060
18A9 ; Bottom # Mn MONGOLIAN LETTER ALI GALI DAGALGA
61-
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overridden, ccc controls order
61+
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overriden, ccc controls order
6262
10AE6 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK BELOW
6363
10F46..10F47 ; Bottom # Mn [2] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING TWO DOTS BELOW
64-
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overridden, ccc controls order
64+
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overriden, ccc controls order
6565
10F4B ; Bottom # Mn SOGDIAN COMBINING CURVE BELOW
66-
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overridden, ccc controls order
66+
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overriden, ccc controls order
6767
10F4D..10F50 ; Bottom # Mn [4] SOGDIAN COMBINING HOOK BELOW..SOGDIAN COMBINING STROKE BELOW
68-
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overridden, ccc controls order
68+
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overriden, ccc controls order
6969
10F83 ; Bottom # Mn OLD UYGHUR COMBINING DOT BELOW
70-
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overridden, ccc controls order
70+
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overriden, ccc controls order
7171
10F85 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS BELOW
7272
16F4F ; Bottom # Mn MIAO SIGN CONSONANT MODIFIER BAR
7373
16F51..16F87 ; Bottom # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI

src/aat/feature_mappings.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
4545
FeatureMapping::new(b"frac", FeatureType::Fractions, DIAGONAL_FRACTIONS, NO_FRACTIONS),
4646
FeatureMapping::new(b"fwid", FeatureType::TextSpacing, MONOSPACED_TEXT, 7),
4747
FeatureMapping::new(b"halt", FeatureType::TextSpacing, ALT_HALF_WIDTH_TEXT, 7),
48-
FeatureMapping::new(b"hist", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
48+
FeatureMapping::new(b"hist", FeatureType::Dummy, 0, 1),
4949
FeatureMapping::new(b"hkna", FeatureType::AlternateKana, ALTERNATE_HORIZ_KANA_ON, ALTERNATE_HORIZ_KANA_OFF),
5050
FeatureMapping::new(b"hlig", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
5151
FeatureMapping::new(b"hngl", FeatureType::Transliteration, HANJA_TO_HANGUL, NO_TRANSLITERATION),
@@ -107,5 +107,6 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
107107
FeatureMapping::new(b"vkna", FeatureType::AlternateKana, ALTERNATE_VERT_KANA_ON, ALTERNATE_VERT_KANA_OFF),
108108
FeatureMapping::new(b"vpal", FeatureType::TextSpacing, ALT_PROPORTIONAL_TEXT, 7),
109109
FeatureMapping::new(b"vrt2", FeatureType::VerticalSubstitution, SUBSTITUTE_VERTICAL_FORMS_ON, SUBSTITUTE_VERTICAL_FORMS_OFF),
110+
FeatureMapping::new(b"vrtr", FeatureType::VerticalSubstitution, 2, 3),
110111
FeatureMapping::new(b"zero", FeatureType::TypographicExtras, SLASHED_ZERO_ON, SLASHED_ZERO_OFF),
111112
];

src/aat/map.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ pub enum FeatureType {
2727
ContextualAlternatives = 36,
2828
LowerCase = 37,
2929
UpperCase = 38,
30+
31+
// In harfbuzz, they just use the number 40 for "hist" but don't give it a name
32+
Dummy = 40,
3033
}
3134

3235
#[derive(Default)]

src/buffer.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ pub mod glyph_flag {
8484
/// within text clusters.
8585
///
8686
/// The UNSAFE_TO_BREAK flag will always imply this flag.
87+
/// To use this flag, you must enable the buffer flag
88+
/// PRODUCE_UNSAFE_TO_CONCAT during shaping, otherwise
89+
/// the glyph flag will not be reliably produced.
8790
pub const UNSAFE_TO_CONCAT: u32 = 0x00000002;
8891

8992
/// All the currently defined flags.
@@ -1349,10 +1352,18 @@ impl Buffer {
13491352
}
13501353

13511354
pub fn unsafe_to_concat(&mut self, start: Option<usize>, end: Option<usize>) {
1355+
if !self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT) {
1356+
return;
1357+
}
1358+
13521359
self._set_glyph_flags(UNSAFE_TO_CONCAT, start, end, Some(true), None);
13531360
}
13541361

13551362
pub fn unsafe_to_break_from_outbuffer(&mut self, start: Option<usize>, end: Option<usize>) {
1363+
if !self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT) {
1364+
return;
1365+
}
1366+
13561367
self._set_glyph_flags(
13571368
UNSAFE_TO_BREAK | UNSAFE_TO_CONCAT,
13581369
start,
@@ -1720,6 +1731,8 @@ bitflags::bitflags! {
17201731
const DO_NOT_INSERT_DOTTED_CIRCLE = 1 << 5;
17211732
/// Indicates that the shape() call and its variants should perform various verification processes on the results of the shaping operation on the buffer. If the verification fails, then either a buffer message is sent, if a message handler is installed on the buffer, or a message is written to standard error. In either case, the shaping result might be modified to show the failed output.
17221733
const VERIFY = 1 << 6;
1734+
/// Indicates that the `UNSAFE_TO_CONCAT` glyph-flag should be produced by the shaper. By default it will not be produced since it incurs a cost.
1735+
const PRODUCE_UNSAFE_TO_CONCAT = 1 << 7;
17231736
}
17241737
}
17251738

src/common.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,8 @@ pub mod script {
452452
pub const TOTO: Script = Script::from_bytes(b"Toto");
453453
pub const VITHKUQI: Script = Script::from_bytes(b"Vith");
454454

455+
pub const SCRIPT_MATH: Script = Script::from_bytes(b"Zmth");
456+
455457
// https://github.com/harfbuzz/harfbuzz/issues/1162
456458
pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");
457459
}

src/complex/arabic.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,11 @@ const MODIFIER_COMBINING_MARKS: &[u32] = &[
599599
0x06E3, // ARABIC SMALL LOW SEEN
600600
0x06E7, // ARABIC SMALL HIGH YEH
601601
0x06E8, // ARABIC SMALL HIGH NOON
602+
0x08CA, // ARABIC SMALL HIGH FARSI YEH
603+
0x08CB, // ARABIC SMALL HIGH YEH BARREE WITH TWO DOTS BELOW
604+
0x08CD, // ARABIC SMALL HIGH ZAH
605+
0x08CE, // ARABIC LARGE ROUND DOT ABOVE
606+
0x08CF, // ARABIC LARGE ROUND DOT BELOW
602607
0x08D3, // ARABIC SMALL LOW WAW
603608
0x08F3, // ARABIC SMALL HIGH WAW
604609
];

src/complex/universal.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ pub mod category {
4848
pub const ZWNJ: u8 = 14; // Zero width non-joiner
4949

5050
// pub const ZWJ: u8 = 15; // Zero width joiner
51-
// pub const WJ: u8 = 16; // Word joiner
51+
pub const WJ: u8 = 16; // Word joiner
5252

5353
pub const RSV: u8 = 17; // Reserved characters
5454
pub const R: u8 = 18; // REPHA
@@ -296,8 +296,7 @@ fn setup_topographical_masks(plan: &ShapePlan, buffer: &mut Buffer) {
296296
let mut end = buffer.next_syllable(0);
297297
while start < buffer.len {
298298
let syllable = buffer.info[start].syllable() & 0x0F;
299-
if syllable == SyllableType::SymbolCluster as u8
300-
|| syllable == SyllableType::HieroglyphCluster as u8
299+
if syllable == SyllableType::HieroglyphCluster as u8
301300
|| syllable == SyllableType::NonCluster as u8
302301
{
303302
last_form = None;
@@ -539,6 +538,6 @@ fn setup_masks(plan: &ShapePlan, _: &Face, buffer: &mut Buffer) {
539538
// We cannot setup masks here. We save information about characters
540539
// and setup masks later on in a pause-callback.
541540
for info in buffer.info_slice_mut() {
542-
info.set_use_category(super::universal_table::get_category(info.glyph_id));
541+
info.set_use_category(super::universal_table::get_category(info));
543542
}
544543
}

0 commit comments

Comments
 (0)