Skip to content

Commit

Permalink
Merge pull request #97 from LaurenzV/4.0
Browse files Browse the repository at this point in the history
Sync with 4.0.1
  • Loading branch information
RazrFalcon authored Feb 15, 2024
2 parents 660d0d0 + bdc935f commit 26fec9c
Show file tree
Hide file tree
Showing 16 changed files with 825 additions and 452 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ If you are interested in helping the project checkout
`rustybuzz` is a complete [harfbuzz](https://github.com/harfbuzz/harfbuzz)'s
shaping algorithm port to Rust.

Matches `harfbuzz` v2.9.1
Matches `harfbuzz` v4.0.1

## Why?

Expand Down
105 changes: 47 additions & 58 deletions scripts/gen-universal-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
headers[j - 1].append(line)
headers.append(["UnicodeData.txt does not have a header."])

data = [{} for _ in files]
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate(files):
for line in f:
Expand Down Expand Up @@ -68,48 +68,23 @@

i0 = i if i < 7 else i - 7
for u in range(start, end + 1):
data[i0][u] = t
unicode_data[i0][u] = t
values[i0][t] = values[i0].get(t, 0) + end - start + 1

defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x1B61] = defaults[0]
data[0][0x1B63] = defaults[0]
data[0][0x1B64] = defaults[0]
data[0][0x1B65] = defaults[0]
data[0][0x1B66] = defaults[0]
data[0][0x1B67] = defaults[0]
data[0][0x1B69] = defaults[0]
data[0][0x1B6A] = defaults[0]
data[0][0x2060] = defaults[0]
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
data[0][0x1B5B] = 'Consonant_Placeholder'
data[0][0x1B5C] = 'Consonant_Placeholder'
data[0][0x1B5F] = 'Consonant_Placeholder'
data[0][0x1B62] = 'Consonant_Placeholder'
data[0][0x1B68] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
data[0][0x11C44] = 'Consonant_Placeholder'
data[0][0x11C45] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
data[0][0x111C8] = 'Consonant_Placeholder'

# Merge data into one dict:
for i, v in enumerate(defaults):
values[i][v] = values[i].get(v, 0) + 1
for i,v in enumerate (defaults):
values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i, d in enumerate(data):
for u, v in d.items():
for i,d in enumerate (unicode_data):
for u,v in d.items ():
if not u in combined:
if i >= 4:
continue
combined[u] = list(defaults)
combined[u] = list (defaults)
combined[u][i] = v
combined = {k: v for k, v in combined.items(
) if v[6] not in DISABLED_SCRIPTS}
data = combined
del combined
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}


property_names = [
Expand Down Expand Up @@ -234,8 +209,8 @@ def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):


def is_CGJ(U, UISC, UDI, UGC, AJT):
# Also includes VARIATION_SELECTOR, WJ, and ZWJ
return U == 0x200D or UDI and UGC in [Mc, Me, Mn]
# Also includes VARIATION_SELECTOR and ZWJ
return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]


def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
Expand Down Expand Up @@ -303,12 +278,13 @@ def is_ZWNJ(U, UISC, UDI, UGC, AJT):


def is_OTHER(U, UISC, UDI, UGC, AJT):
# Also includes BASE_IND, Rsv, and SYM
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
# Also includes BASE_IND and SYM
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
and not is_BASE(U, UISC, UDI, UGC, AJT)
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
and not is_CGJ(U, UISC, UDI, UGC, AJT)
and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
)


Expand All @@ -326,16 +302,20 @@ def is_SYM_MOD(U, UISC, UDI, UGC, AJT):


def is_VOWEL(U, UISC, UDI, UGC, AJT):
# https://github.com/harfbuzz/harfbuzz/issues/376
return (UISC == Pure_Killer or
(UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
UGC != Lo and UISC in [Vowel, Vowel_Dependent])


def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
# https://github.com/harfbuzz/harfbuzz/issues/376
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
UGC != Lo and UISC == Bindu)

def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    """Return True when the code point belongs to the USE 'WJ' category.

    Two independent routes lead to a match:

    * ``UDI`` is truthy, ``U`` is not one of the explicitly excluded
      code points, ``UISC`` is ``Other`` and ``is_CGJ`` rejects it; or
    * ``UGC`` is ``Cn`` (this is how Rsv ends up included here).

    ``AJT`` is accepted so that all ``is_*`` predicates share a signature;
    it is only forwarded to ``is_CGJ``.
    """
    # Code points that never take the first route, even when UDI is set.
    excluded = (0x115F, 0x1160, 0x3164, 0xFFA0,
                0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3)

    if (UDI
            and U not in excluded
            and UISC == Other
            and not is_CGJ(U, UISC, UDI, UGC, AJT)):
        return True

    # Also includes Rsv
    return UGC == Cn

use_mapping = {
'B': is_BASE,
Expand All @@ -362,6 +342,7 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
'SM': is_SYM_MOD,
'V': is_VOWEL,
'VM': is_VOWEL_MOD,
'WJ': is_Word_Joiner,
}

use_positions = {
Expand Down Expand Up @@ -435,10 +416,6 @@ def map_to_use(data):
if U == 0x1CED:
UISC = Tone_Mark

# TODO: https://github.com/microsoft/font-tools/issues/1
if U == 0xA982:
UISC = Consonant_Succeeding_Repha

values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
assert len(values) == 1, "%s %s %s %s %s %s" % (
hex(U), UISC, UDI, UGC, AJT, values)
Expand Down Expand Up @@ -482,19 +459,20 @@ def map_to_use(data):
return out


defaults = ('O', 'No_Block')
data = map_to_use(data)
use_data = map_to_use(combined)

print('// WARNING: this file was generated by ../scripts/gen-universal-table.py')
print()
print('use crate::GlyphInfo;')
print('use unicode_properties::GeneralCategory;')
print('use super::universal::{Category, category::*};')

total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
def print_block(block, start, end, use_data):
global total, used, last_block
if block and block != last_block:
print()
Expand All @@ -509,18 +487,24 @@ def print_block(block, start, end, data):
if u % 16 == 0:
print()
print(' /* %04X */' % u, end='')
if u in data:
if u in use_data:
num += 1
d = data.get(u, defaults)
print('%6s,' % d[0], end='')
d = use_data.get(u)
if d is not None:
d = d[0]
elif u in unicode_data[4]:
d = 'O'
else:
d = 'WJ'
print("%6s," % d, end='')

total += end - start + 1
used += num
if block:
last_block = block


uu = sorted(data.keys())
uu = sorted(use_data.keys())

last = -100000
num = 0
Expand All @@ -534,19 +518,19 @@ def print_block(block, start, end, data):
for u in uu:
if u <= last:
continue
if data[u][0] == 'O':
if use_data[u][0] == 'O':
continue
block = data[u][1]
block = use_data[u][1]

start = u // 8 * 8
end = start + 1
while end in uu and block == data[end][1]:
while end in uu and block == use_data[end][1]:
end += 1
end = (end - 1) // 8 * 8 + 7

if start != last + 1:
if start - last <= 1 + 16 * 3:
print_block(None, last + 1, start - 1, data)
print_block(None, last + 1, start - 1, use_data)
last = start - 1
else:
if last >= 0:
Expand All @@ -556,7 +540,7 @@ def print_block(block, start, end, data):
(start, offset))
starts.append(start)

print_block(block, start, end, data)
print_block(block, start, end, use_data)
last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
Expand All @@ -570,7 +554,8 @@ def print_block(block, start, end, data):
print(o)
print()
print('#[rustfmt::skip]')
print('pub fn get_category(u: u32) -> Category {')
print('pub fn get_category(info: &GlyphInfo) -> Category {')
print(' let u = info.glyph_id;')
print(' match u >> %d {' % page_bits)
pages = set([u >> page_bits for u in starts + ends])
for p in sorted(pages):
Expand All @@ -585,6 +570,10 @@ def print_block(block, start, end, data):
print(' _ => {}')
print(' }')
print()
print(' if info.general_category() == GeneralCategory::Unassigned {')
print(' return WJ;')
print(' }')
print()
print(' O')
print('}')

Expand Down
12 changes: 6 additions & 6 deletions scripts/ms-use/IndicPositionalCategory-Additional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Not derivable
# Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17
# Updated for Unicode 10.0 by Andrew Glass 2017-07-25
# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
# Updated for L2/19-083 by Andrew Glass 2019-05-06
# Updated for Unicode 12.1 by Andrew Glass 2019-05-30
# Updated for Unicode 13.0 by Andrew Glass 2020-07-28
Expand Down Expand Up @@ -58,16 +58,16 @@ AA35   ; Top # Mn       CHAM CONSONANT SIGN
# Indic_Positional_Category=Bottom
0859..085B ; Bottom # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
18A9 ; Bottom # Mn MONGOLIAN LETTER ALI GALI DAGALGA
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overridden, ccc controls order
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overridden, ccc controls order
10AE6 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK BELOW
10F46..10F47 ; Bottom # Mn [2] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING TWO DOTS BELOW
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overridden, ccc controls order
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overridden, ccc controls order
10F4B ; Bottom # Mn SOGDIAN COMBINING CURVE BELOW
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overridden, ccc controls order
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overridden, ccc controls order
10F4D..10F50 ; Bottom # Mn [4] SOGDIAN COMBINING HOOK BELOW..SOGDIAN COMBINING STROKE BELOW
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overridden, ccc controls order
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overridden, ccc controls order
10F83 ; Bottom # Mn OLD UYGHUR COMBINING DOT BELOW
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overridden, ccc controls order
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overridden, ccc controls order
10F85 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS BELOW
16F4F ; Bottom # Mn MIAO SIGN CONSONANT MODIFIER BAR
16F51..16F87 ; Bottom # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
Expand Down
3 changes: 2 additions & 1 deletion src/aat/feature_mappings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
FeatureMapping::new(b"frac", FeatureType::Fractions, DIAGONAL_FRACTIONS, NO_FRACTIONS),
FeatureMapping::new(b"fwid", FeatureType::TextSpacing, MONOSPACED_TEXT, 7),
FeatureMapping::new(b"halt", FeatureType::TextSpacing, ALT_HALF_WIDTH_TEXT, 7),
FeatureMapping::new(b"hist", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
FeatureMapping::new(b"hist", FeatureType::Dummy, 0, 1),
FeatureMapping::new(b"hkna", FeatureType::AlternateKana, ALTERNATE_HORIZ_KANA_ON, ALTERNATE_HORIZ_KANA_OFF),
FeatureMapping::new(b"hlig", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
FeatureMapping::new(b"hngl", FeatureType::Transliteration, HANJA_TO_HANGUL, NO_TRANSLITERATION),
Expand Down Expand Up @@ -107,5 +107,6 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
FeatureMapping::new(b"vkna", FeatureType::AlternateKana, ALTERNATE_VERT_KANA_ON, ALTERNATE_VERT_KANA_OFF),
FeatureMapping::new(b"vpal", FeatureType::TextSpacing, ALT_PROPORTIONAL_TEXT, 7),
FeatureMapping::new(b"vrt2", FeatureType::VerticalSubstitution, SUBSTITUTE_VERTICAL_FORMS_ON, SUBSTITUTE_VERTICAL_FORMS_OFF),
FeatureMapping::new(b"vrtr", FeatureType::VerticalSubstitution, 2, 3),
FeatureMapping::new(b"zero", FeatureType::TypographicExtras, SLASHED_ZERO_ON, SLASHED_ZERO_OFF),
];
3 changes: 3 additions & 0 deletions src/aat/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ pub enum FeatureType {
ContextualAlternatives = 36,
LowerCase = 37,
UpperCase = 38,

// In harfbuzz, they just use the number 40 for "hist" but don't give it a name
Dummy = 40,
}

#[derive(Default)]
Expand Down
13 changes: 13 additions & 0 deletions src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ pub mod glyph_flag {
/// within text clusters.
///
/// The UNSAFE_TO_BREAK flag will always imply this flag.
/// To use this flag, you must enable the buffer flag
/// PRODUCE_UNSAFE_TO_CONCAT during shaping, otherwise
/// this glyph flag will not be reliably produced.
pub const UNSAFE_TO_CONCAT: u32 = 0x00000002;

/// All the currently defined flags.
Expand Down Expand Up @@ -1349,10 +1352,18 @@ impl Buffer {
}

/// Sets the `UNSAFE_TO_CONCAT` glyph flag via `_set_glyph_flags`, forwarding
/// `start` and `end` unchanged.
///
/// Producing this flag is opt-in: unless the buffer flags contain
/// `BufferFlags::PRODUCE_UNSAFE_TO_CONCAT`, the call does nothing.
pub fn unsafe_to_concat(&mut self, start: Option<usize>, end: Option<usize>) {
    if self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT) {
        self._set_glyph_flags(UNSAFE_TO_CONCAT, start, end, Some(true), None);
    }
}

pub fn unsafe_to_break_from_outbuffer(&mut self, start: Option<usize>, end: Option<usize>) {
if !self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT) {
return;
}

self._set_glyph_flags(
UNSAFE_TO_BREAK | UNSAFE_TO_CONCAT,
start,
Expand Down Expand Up @@ -1720,6 +1731,8 @@ bitflags::bitflags! {
const DO_NOT_INSERT_DOTTED_CIRCLE = 1 << 5;
/// Indicates that the shape() call and its variants should perform various verification processes on the results of the shaping operation on the buffer. If the verification fails, then either a buffer message is sent, if a message handler is installed on the buffer, or a message is written to standard error. In either case, the shaping result might be modified to show the failed output.
const VERIFY = 1 << 6;
/// Indicates that the `UNSAFE_TO_CONCAT` glyph-flag should be produced by the shaper. By default it will not be produced since it incurs a cost.
const PRODUCE_UNSAFE_TO_CONCAT = 1 << 7;
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,8 @@ pub mod script {
pub const TOTO: Script = Script::from_bytes(b"Toto");
pub const VITHKUQI: Script = Script::from_bytes(b"Vith");

pub const SCRIPT_MATH: Script = Script::from_bytes(b"Zmth");

// https://github.com/harfbuzz/harfbuzz/issues/1162
pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");
}
Expand Down
5 changes: 5 additions & 0 deletions src/complex/arabic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,11 @@ const MODIFIER_COMBINING_MARKS: &[u32] = &[
0x06E3, // ARABIC SMALL LOW SEEN
0x06E7, // ARABIC SMALL HIGH YEH
0x06E8, // ARABIC SMALL HIGH NOON
0x08CA, // ARABIC SMALL HIGH FARSI YEH
0x08CB, // ARABIC SMALL HIGH YEH BARREE WITH TWO DOTS BELOW
0x08CD, // ARABIC SMALL HIGH ZAH
0x08CE, // ARABIC LARGE ROUND DOT ABOVE
0x08CF, // ARABIC LARGE ROUND DOT BELOW
0x08D3, // ARABIC SMALL LOW WAW
0x08F3, // ARABIC SMALL HIGH WAW
];
Expand Down
7 changes: 3 additions & 4 deletions src/complex/universal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ pub mod category {
pub const ZWNJ: u8 = 14; // Zero width non-joiner

// pub const ZWJ: u8 = 15; // Zero width joiner
// pub const WJ: u8 = 16; // Word joiner
pub const WJ: u8 = 16; // Word joiner

pub const RSV: u8 = 17; // Reserved characters
pub const R: u8 = 18; // REPHA
Expand Down Expand Up @@ -296,8 +296,7 @@ fn setup_topographical_masks(plan: &ShapePlan, buffer: &mut Buffer) {
let mut end = buffer.next_syllable(0);
while start < buffer.len {
let syllable = buffer.info[start].syllable() & 0x0F;
if syllable == SyllableType::SymbolCluster as u8
|| syllable == SyllableType::HieroglyphCluster as u8
if syllable == SyllableType::HieroglyphCluster as u8
|| syllable == SyllableType::NonCluster as u8
{
last_form = None;
Expand Down Expand Up @@ -539,6 +538,6 @@ fn setup_masks(plan: &ShapePlan, _: &Face, buffer: &mut Buffer) {
// We cannot setup masks here. We save information about characters
// and setup masks later on in a pause-callback.
for info in buffer.info_slice_mut() {
info.set_use_category(super::universal_table::get_category(info.glyph_id));
info.set_use_category(super::universal_table::get_category(info));
}
}
Loading

0 comments on commit 26fec9c

Please sign in to comment.