Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync with 4.0.1 #97

Merged
merged 12 commits into from
Feb 15, 2024
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ If you are interested in helping the project checkout
`rustybuzz` is a complete port of [harfbuzz](https://github.com/harfbuzz/harfbuzz)'s
shaping algorithm to Rust.

Matches `harfbuzz` v2.9.1
Matches `harfbuzz` v4.0.1

## Why?

Expand Down
105 changes: 47 additions & 58 deletions scripts/gen-universal-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
headers[j - 1].append(line)
headers.append(["UnicodeData.txt does not have a header."])

data = [{} for _ in files]
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate(files):
for line in f:
Expand Down Expand Up @@ -68,48 +68,23 @@

i0 = i if i < 7 else i - 7
for u in range(start, end + 1):
data[i0][u] = t
unicode_data[i0][u] = t
values[i0][t] = values[i0].get(t, 0) + end - start + 1

defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x1B61] = defaults[0]
data[0][0x1B63] = defaults[0]
data[0][0x1B64] = defaults[0]
data[0][0x1B65] = defaults[0]
data[0][0x1B66] = defaults[0]
data[0][0x1B67] = defaults[0]
data[0][0x1B69] = defaults[0]
data[0][0x1B6A] = defaults[0]
data[0][0x2060] = defaults[0]
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
data[0][0x1B5B] = 'Consonant_Placeholder'
data[0][0x1B5C] = 'Consonant_Placeholder'
data[0][0x1B5F] = 'Consonant_Placeholder'
data[0][0x1B62] = 'Consonant_Placeholder'
data[0][0x1B68] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
data[0][0x11C44] = 'Consonant_Placeholder'
data[0][0x11C45] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
data[0][0x111C8] = 'Consonant_Placeholder'

# Merge data into one dict:
for i, v in enumerate(defaults):
values[i][v] = values[i].get(v, 0) + 1
for i,v in enumerate (defaults):
values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i, d in enumerate(data):
for u, v in d.items():
for i,d in enumerate (unicode_data):
for u,v in d.items ():
if not u in combined:
if i >= 4:
continue
combined[u] = list(defaults)
combined[u] = list (defaults)
combined[u][i] = v
combined = {k: v for k, v in combined.items(
) if v[6] not in DISABLED_SCRIPTS}
data = combined
del combined
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}


property_names = [
Expand Down Expand Up @@ -234,8 +209,8 @@ def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):


def is_CGJ(U, UISC, UDI, UGC, AJT):
# Also includes VARIATION_SELECTOR, WJ, and ZWJ
return U == 0x200D or UDI and UGC in [Mc, Me, Mn]
# Also includes VARIATION_SELECTOR and ZWJ
return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]


def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
Expand Down Expand Up @@ -303,12 +278,13 @@ def is_ZWNJ(U, UISC, UDI, UGC, AJT):


def is_OTHER(U, UISC, UDI, UGC, AJT):
# Also includes BASE_IND, Rsv, and SYM
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
# Also includes BASE_IND, and SYM
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
and not is_BASE(U, UISC, UDI, UGC, AJT)
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
and not is_CGJ(U, UISC, UDI, UGC, AJT)
and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
)


Expand All @@ -326,16 +302,20 @@ def is_SYM_MOD(U, UISC, UDI, UGC, AJT):


def is_VOWEL(U, UISC, UDI, UGC, AJT):
# https://github.com/harfbuzz/harfbuzz/issues/376
return (UISC == Pure_Killer or
(UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
UGC != Lo and UISC in [Vowel, Vowel_Dependent])


def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
# https://github.com/harfbuzz/harfbuzz/issues/376
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
UGC != Lo and UISC == Bindu)

def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    # Word joiner (WJ) class. Also includes Rsv.
    #
    # A code point qualifies when UDI is set, it is not one of the listed
    # exceptions, its UISC is Other and it is not already classified as CGJ.
    # Anything whose UGC is Cn qualifies regardless of the other properties.
    excluded = [0x115F, 0x1160, 0x3164, 0xFFA0,
                0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
    if UDI and U not in excluded and UISC == Other:
        # is_CGJ is only consulted once the cheaper checks have passed,
        # matching the short-circuit order of the single-expression form.
        if not is_CGJ(U, UISC, UDI, UGC, AJT):
            return True
    return UGC == Cn

use_mapping = {
'B': is_BASE,
Expand All @@ -362,6 +342,7 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
'SM': is_SYM_MOD,
'V': is_VOWEL,
'VM': is_VOWEL_MOD,
'WJ': is_Word_Joiner,
}

use_positions = {
Expand Down Expand Up @@ -435,10 +416,6 @@ def map_to_use(data):
if U == 0x1CED:
UISC = Tone_Mark

# TODO: https://github.com/microsoft/font-tools/issues/1
if U == 0xA982:
UISC = Consonant_Succeeding_Repha

values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
assert len(values) == 1, "%s %s %s %s %s %s" % (
hex(U), UISC, UDI, UGC, AJT, values)
Expand Down Expand Up @@ -482,19 +459,20 @@ def map_to_use(data):
return out


defaults = ('O', 'No_Block')
data = map_to_use(data)
use_data = map_to_use(combined)

print('// WARNING: this file was generated by ../scripts/gen-universal-table.py')
print()
print('use crate::GlyphInfo;')
print('use unicode_properties::GeneralCategory;')
print('use super::universal::{Category, category::*};')

total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
def print_block(block, start, end, use_data):
global total, used, last_block
if block and block != last_block:
print()
Expand All @@ -509,18 +487,24 @@ def print_block(block, start, end, data):
if u % 16 == 0:
print()
print(' /* %04X */' % u, end='')
if u in data:
if u in use_data:
num += 1
d = data.get(u, defaults)
print('%6s,' % d[0], end='')
d = use_data.get(u)
if d is not None:
d = d[0]
elif u in unicode_data[4]:
d = 'O'
else:
d = 'WJ'
print("%6s," % d, end='')

total += end - start + 1
used += num
if block:
last_block = block


uu = sorted(data.keys())
uu = sorted(use_data.keys())

last = -100000
num = 0
Expand All @@ -534,19 +518,19 @@ def print_block(block, start, end, data):
for u in uu:
if u <= last:
continue
if data[u][0] == 'O':
if use_data[u][0] == 'O':
continue
block = data[u][1]
block = use_data[u][1]

start = u // 8 * 8
end = start + 1
while end in uu and block == data[end][1]:
while end in uu and block == use_data[end][1]:
end += 1
end = (end - 1) // 8 * 8 + 7

if start != last + 1:
if start - last <= 1 + 16 * 3:
print_block(None, last + 1, start - 1, data)
print_block(None, last + 1, start - 1, use_data)
last = start - 1
else:
if last >= 0:
Expand All @@ -556,7 +540,7 @@ def print_block(block, start, end, data):
(start, offset))
starts.append(start)

print_block(block, start, end, data)
print_block(block, start, end, use_data)
last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
Expand All @@ -570,7 +554,8 @@ def print_block(block, start, end, data):
print(o)
print()
print('#[rustfmt::skip]')
print('pub fn get_category(u: u32) -> Category {')
print('pub fn get_category(info: &GlyphInfo) -> Category {')
print(' let u = info.glyph_id;')
print(' match u >> %d {' % page_bits)
pages = set([u >> page_bits for u in starts + ends])
for p in sorted(pages):
Expand All @@ -585,6 +570,10 @@ def print_block(block, start, end, data):
print(' _ => {}')
print(' }')
print()
print(' if info.general_category() == GeneralCategory::Unassigned {')
print(' return WJ;')
print(' }')
print()
print(' O')
print('}')

Expand Down
12 changes: 6 additions & 6 deletions scripts/ms-use/IndicPositionalCategory-Additional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Not derivable
# Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17
# Updated for Unicode 10.0 by Andrew Glass 2017-07-25
# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
# Ammended for Unicode 10.0 by Andrew Glass 2018-09-21
# Updated for L2/19-083 by Andrew Glass 2019-05-06
# Updated for Unicode 12.1 by Andrew Glass 2019-05-30
# Updated for Unicode 13.0 by Andrew Glass 2020-07-28
Expand Down Expand Up @@ -58,16 +58,16 @@ AA35   ; Top # Mn       CHAM CONSONANT SIGN
# Indic_Positional_Category=Bottom
0859..085B ; Bottom # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
18A9 ; Bottom # Mn MONGOLIAN LETTER ALI GALI DAGALGA
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overridden, ccc controls order
10AE5 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK ABOVE # Overriden, ccc controls order
10AE6 ; Bottom # Mn MANICHAEAN ABBREVIATION MARK BELOW
10F46..10F47 ; Bottom # Mn [2] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING TWO DOTS BELOW
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overridden, ccc controls order
10F48..10F4A ; Bottom # Mn [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE # Overriden, ccc controls order
10F4B ; Bottom # Mn SOGDIAN COMBINING CURVE BELOW
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overridden, ccc controls order
10F4C ; Bottom # Mn SOGDIAN COMBINING HOOK ABOVE # Overriden, ccc controls order
10F4D..10F50 ; Bottom # Mn [4] SOGDIAN COMBINING HOOK BELOW..SOGDIAN COMBINING STROKE BELOW
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overridden, ccc controls order
10F82 ; Bottom # Mn OLD UYGHUR COMBINING DOT ABOVE # Overriden, ccc controls order
10F83 ; Bottom # Mn OLD UYGHUR COMBINING DOT BELOW
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overridden, ccc controls order
10F84 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS ABOVE # Overriden, ccc controls order
10F85 ; Bottom # Mn OLD UYGHUR COMBINING TWO DOTS BELOW
16F4F ; Bottom # Mn MIAO SIGN CONSONANT MODIFIER BAR
16F51..16F87 ; Bottom # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
Expand Down
3 changes: 2 additions & 1 deletion src/aat/feature_mappings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
FeatureMapping::new(b"frac", FeatureType::Fractions, DIAGONAL_FRACTIONS, NO_FRACTIONS),
FeatureMapping::new(b"fwid", FeatureType::TextSpacing, MONOSPACED_TEXT, 7),
FeatureMapping::new(b"halt", FeatureType::TextSpacing, ALT_HALF_WIDTH_TEXT, 7),
FeatureMapping::new(b"hist", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
FeatureMapping::new(b"hist", FeatureType::Dummy, 0, 1),
FeatureMapping::new(b"hkna", FeatureType::AlternateKana, ALTERNATE_HORIZ_KANA_ON, ALTERNATE_HORIZ_KANA_OFF),
FeatureMapping::new(b"hlig", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
FeatureMapping::new(b"hngl", FeatureType::Transliteration, HANJA_TO_HANGUL, NO_TRANSLITERATION),
Expand Down Expand Up @@ -107,5 +107,6 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
FeatureMapping::new(b"vkna", FeatureType::AlternateKana, ALTERNATE_VERT_KANA_ON, ALTERNATE_VERT_KANA_OFF),
FeatureMapping::new(b"vpal", FeatureType::TextSpacing, ALT_PROPORTIONAL_TEXT, 7),
FeatureMapping::new(b"vrt2", FeatureType::VerticalSubstitution, SUBSTITUTE_VERTICAL_FORMS_ON, SUBSTITUTE_VERTICAL_FORMS_OFF),
FeatureMapping::new(b"vrtr", FeatureType::VerticalSubstitution, 2, 3),
FeatureMapping::new(b"zero", FeatureType::TypographicExtras, SLASHED_ZERO_ON, SLASHED_ZERO_OFF),
];
3 changes: 3 additions & 0 deletions src/aat/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ pub enum FeatureType {
ContextualAlternatives = 36,
LowerCase = 37,
UpperCase = 38,

// In harfbuzz, they just use the number 40 for "hist" but don't give it a name
Dummy = 40,
}

#[derive(Default)]
Expand Down
13 changes: 13 additions & 0 deletions src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ pub mod glyph_flag {
/// within text clusters.
///
/// The UNSAFE_TO_BREAK flag will always imply this flag.
/// To use this flag, you must enable the buffer flag
/// PRODUCE_UNSAFE_TO_CONCAT during shaping, otherwise
/// this glyph flag will not be reliably produced.
pub const UNSAFE_TO_CONCAT: u32 = 0x00000002;

/// All the currently defined flags.
Expand Down Expand Up @@ -1349,10 +1352,18 @@ impl Buffer {
}

/// Applies the `UNSAFE_TO_CONCAT` glyph flag via `_set_glyph_flags`,
/// forwarding `start` and `end` unchanged.
///
/// This is a no-op unless the buffer flags contain
/// `BufferFlags::PRODUCE_UNSAFE_TO_CONCAT`.
pub fn unsafe_to_concat(&mut self, start: Option<usize>, end: Option<usize>) {
    let produce_concat_flag = self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT);
    if produce_concat_flag {
        self._set_glyph_flags(UNSAFE_TO_CONCAT, start, end, Some(true), None);
    }
}

pub fn unsafe_to_break_from_outbuffer(&mut self, start: Option<usize>, end: Option<usize>) {
if !self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT) {
return;
}

self._set_glyph_flags(
UNSAFE_TO_BREAK | UNSAFE_TO_CONCAT,
start,
Expand Down Expand Up @@ -1720,6 +1731,8 @@ bitflags::bitflags! {
const DO_NOT_INSERT_DOTTED_CIRCLE = 1 << 5;
/// Indicates that the shape() call and its variants should perform various verification processes on the results of the shaping operation on the buffer. If the verification fails, then either a buffer message is sent, if a message handler is installed on the buffer, or a message is written to standard error. In either case, the shaping result might be modified to show the failed output.
const VERIFY = 1 << 6;
/// Indicates that the `UNSAFE_TO_CONCAT` glyph-flag should be produced by the shaper. By default it will not be produced since it incurs a cost.
const PRODUCE_UNSAFE_TO_CONCAT = 1 << 7;
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,8 @@ pub mod script {
pub const TOTO: Script = Script::from_bytes(b"Toto");
pub const VITHKUQI: Script = Script::from_bytes(b"Vith");

pub const SCRIPT_MATH: Script = Script::from_bytes(b"Zmth");

// https://github.com/harfbuzz/harfbuzz/issues/1162
pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");
}
Expand Down
5 changes: 5 additions & 0 deletions src/complex/arabic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,11 @@ const MODIFIER_COMBINING_MARKS: &[u32] = &[
0x06E3, // ARABIC SMALL LOW SEEN
0x06E7, // ARABIC SMALL HIGH YEH
0x06E8, // ARABIC SMALL HIGH NOON
0x08CA, // ARABIC SMALL HIGH FARSI YEH
0x08CB, // ARABIC SMALL HIGH YEH BARREE WITH TWO DOTS BELOW
0x08CD, // ARABIC SMALL HIGH ZAH
0x08CE, // ARABIC LARGE ROUND DOT ABOVE
0x08CF, // ARABIC LARGE ROUND DOT BELOW
0x08D3, // ARABIC SMALL LOW WAW
0x08F3, // ARABIC SMALL HIGH WAW
];
Expand Down
7 changes: 3 additions & 4 deletions src/complex/universal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ pub mod category {
pub const ZWNJ: u8 = 14; // Zero width non-joiner

// pub const ZWJ: u8 = 15; // Zero width joiner
// pub const WJ: u8 = 16; // Word joiner
pub const WJ: u8 = 16; // Word joiner

pub const RSV: u8 = 17; // Reserved characters
pub const R: u8 = 18; // REPHA
Expand Down Expand Up @@ -296,8 +296,7 @@ fn setup_topographical_masks(plan: &ShapePlan, buffer: &mut Buffer) {
let mut end = buffer.next_syllable(0);
while start < buffer.len {
let syllable = buffer.info[start].syllable() & 0x0F;
if syllable == SyllableType::SymbolCluster as u8
|| syllable == SyllableType::HieroglyphCluster as u8
if syllable == SyllableType::HieroglyphCluster as u8
|| syllable == SyllableType::NonCluster as u8
{
last_form = None;
Expand Down Expand Up @@ -539,6 +538,6 @@ fn setup_masks(plan: &ShapePlan, _: &Face, buffer: &mut Buffer) {
// We cannot setup masks here. We save information about characters
// and setup masks later on in a pause-callback.
for info in buffer.info_slice_mut() {
info.set_use_category(super::universal_table::get_category(info.glyph_id));
info.set_use_category(super::universal_table::get_category(info));
}
}
Loading
Loading