harfbuzz · RazrFalcon · Feb 15, 2024 · Feb 14, 2024 · Feb 14, 2024 · Feb 15, 2024
diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ If you are interested in helping the project checkout
 `rustybuzz` is a complete [harfbuzz](https://github.com/harfbuzz/harfbuzz)'s
 shaping algorithm port to Rust.
 
-Matches `harfbuzz` v2.9.1
+Matches `harfbuzz` v4.0.1
 
 ## Why?
 

diff --git a/scripts/gen-universal-table.py b/scripts/gen-universal-table.py
@@ -34,7 +34,7 @@
         headers[j - 1].append(line)
 headers.append(["UnicodeData.txt does not have a header."])
 
-data = [{} for _ in files]
+unicode_data = [{} for _ in files]
 values = [{} for _ in files]
 for i, f in enumerate(files):
     for line in f:
@@ -68,48 +68,23 @@
 
         i0 = i if i < 7 else i - 7
         for u in range(start, end + 1):
-            data[i0][u] = t
+            unicode_data[i0][u] = t
         values[i0][t] = values[i0].get(t, 0) + end - start + 1
 
 defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
 
-# TODO Characters that are not in Unicode Indic files, but used in USE
-data[0][0x1B61] = defaults[0]
-data[0][0x1B63] = defaults[0]
-data[0][0x1B64] = defaults[0]
-data[0][0x1B65] = defaults[0]
-data[0][0x1B66] = defaults[0]
-data[0][0x1B67] = defaults[0]
-data[0][0x1B69] = defaults[0]
-data[0][0x1B6A] = defaults[0]
-data[0][0x2060] = defaults[0]
-# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
-data[0][0x1B5B] = 'Consonant_Placeholder'
-data[0][0x1B5C] = 'Consonant_Placeholder'
-data[0][0x1B5F] = 'Consonant_Placeholder'
-data[0][0x1B62] = 'Consonant_Placeholder'
-data[0][0x1B68] = 'Consonant_Placeholder'
-# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
-data[0][0x11C44] = 'Consonant_Placeholder'
-data[0][0x11C45] = 'Consonant_Placeholder'
-# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
-data[0][0x111C8] = 'Consonant_Placeholder'
-
 # Merge data into one dict:
-for i, v in enumerate(defaults):
-    values[i][v] = values[i].get(v, 0) + 1
+for i,v in enumerate (defaults):
+    values[i][v] = values[i].get (v, 0) + 1
 combined = {}
-for i, d in enumerate(data):
-    for u, v in d.items():
+for i,d in enumerate (unicode_data):
+    for u,v in d.items ():
         if not u in combined:
             if i >= 4:
                 continue
-            combined[u] = list(defaults)
+            combined[u] = list (defaults)
         combined[u][i] = v
-combined = {k: v for k, v in combined.items(
-) if v[6] not in DISABLED_SCRIPTS}
-data = combined
-del combined
+combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
 
 
 property_names = [
@@ -234,8 +209,8 @@ def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
 
 
 def is_CGJ(U, UISC, UDI, UGC, AJT):
-    # Also includes VARIATION_SELECTOR, WJ, and ZWJ
-    return U == 0x200D or UDI and UGC in [Mc, Me, Mn]
+    # Also includes VARIATION_SELECTOR and ZWJ
+    return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
 
 
 def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
@@ -303,12 +278,13 @@ def is_ZWNJ(U, UISC, UDI, UGC, AJT):
 
 
 def is_OTHER(U, UISC, UDI, UGC, AJT):
-    # Also includes BASE_IND, Rsv, and SYM
-    return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
+    # Also includes BASE_IND and SYM
+    return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
             and not is_BASE(U, UISC, UDI, UGC, AJT)
             and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
             and not is_CGJ(U, UISC, UDI, UGC, AJT)
             and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
+            and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
             )
 
 
@@ -326,16 +302,20 @@ def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
 
 
 def is_VOWEL(U, UISC, UDI, UGC, AJT):
-    # https://github.com/harfbuzz/harfbuzz/issues/376
     return (UISC == Pure_Killer or
-            (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
+            UGC != Lo and UISC in [Vowel, Vowel_Dependent])
 
 
 def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
-    # https://github.com/harfbuzz/harfbuzz/issues/376
     return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
-            (UGC != Lo and (UISC == Bindu or U in [0xAA29])))
+            UGC != Lo and UISC == Bindu)
 
+def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
+    # Also includes Rsv
+    return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
+            and UISC == Other
+            and not is_CGJ(U, UISC, UDI, UGC, AJT)
+            ) or UGC == Cn
 
 use_mapping = {
     'B': is_BASE,
@@ -362,6 +342,7 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
     'SM': is_SYM_MOD,
     'V': is_VOWEL,
     'VM': is_VOWEL_MOD,
+    'WJ': is_Word_Joiner,
 }
 
 use_positions = {
@@ -435,10 +416,6 @@ def map_to_use(data):
         if U == 0x1CED:
             UISC = Tone_Mark
 
-        # TODO: https://github.com/microsoft/font-tools/issues/1
-        if U == 0xA982:
-            UISC = Consonant_Succeeding_Repha
-
         values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
         assert len(values) == 1, "%s %s %s %s %s %s" % (
             hex(U), UISC, UDI, UGC, AJT, values)
@@ -482,19 +459,20 @@ def map_to_use(data):
     return out
 
 
-defaults = ('O', 'No_Block')
-data = map_to_use(data)
+use_data = map_to_use(combined)
 
 print('// WARNING: this file was generated by ../scripts/gen-universal-table.py')
 print()
+print('use crate::GlyphInfo;')
+print('use unicode_properties::GeneralCategory;')
 print('use super::universal::{Category, category::*};')
 
 total = 0
 used = 0
 last_block = None
 
 
-def print_block(block, start, end, data):
+def print_block(block, start, end, use_data):
     global total, used, last_block
     if block and block != last_block:
         print()
@@ -509,18 +487,24 @@ def print_block(block, start, end, data):
         if u % 16 == 0:
             print()
             print('  /* %04X */' % u, end='')
-        if u in data:
+        if u in use_data:
             num += 1
-        d = data.get(u, defaults)
-        print('%6s,' % d[0], end='')
+        d = use_data.get(u)
+        if d is not None:
+            d = d[0]
+        elif u in unicode_data[4]:
+            d = 'O'
+        else:
+            d = 'WJ'
+        print("%6s," % d, end='')
 
     total += end - start + 1
     used += num
     if block:
         last_block = block
 
 
-uu = sorted(data.keys())
+uu = sorted(use_data.keys())
 
 last = -100000
 num = 0
@@ -534,19 +518,19 @@ def print_block(block, start, end, data):
 for u in uu:
     if u <= last:
         continue
-    if data[u][0] == 'O':
+    if use_data[u][0] == 'O':
         continue
-    block = data[u][1]
+    block = use_data[u][1]
 
     start = u // 8 * 8
     end = start + 1
-    while end in uu and block == data[end][1]:
+    while end in uu and block == use_data[end][1]:
         end += 1
     end = (end - 1) // 8 * 8 + 7
 
     if start != last + 1:
         if start - last <= 1 + 16 * 3:
-            print_block(None, last + 1, start - 1, data)
+            print_block(None, last + 1, start - 1, use_data)
             last = start - 1
         else:
             if last >= 0:
@@ -556,7 +540,7 @@ def print_block(block, start, end, data):
                            (start, offset))
             starts.append(start)
 
-    print_block(block, start, end, data)
+    print_block(block, start, end, use_data)
     last = end
 ends.append(last + 1)
 offset += ends[-1] - starts[-1]
@@ -570,7 +554,8 @@ def print_block(block, start, end, data):
     print(o)
 print()
 print('#[rustfmt::skip]')
-print('pub fn get_category(u: u32) -> Category {')
+print('pub fn get_category(info: &GlyphInfo) -> Category {')
+print('    let u = info.glyph_id;')
 print('    match u >> %d {' % page_bits)
 pages = set([u >> page_bits for u in starts + ends])
 for p in sorted(pages):
@@ -585,6 +570,10 @@ def print_block(block, start, end, data):
 print('        _ => {}')
 print('    }')
 print()
+print('    if info.general_category() == GeneralCategory::Unassigned {')
+print('        return WJ;')
+print('    }')
+print()
 print('    O')
 print('}')
 

diff --git a/scripts/ms-use/IndicPositionalCategory-Additional.txt b/scripts/ms-use/IndicPositionalCategory-Additional.txt
@@ -2,7 +2,7 @@
 # Not derivable
 # Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17
 # Updated for Unicode 10.0 by Andrew Glass 2017-07-25
-# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
+# Amended for Unicode 10.0 by Andrew Glass 2018-09-21
 # Updated for L2/19-083    by Andrew Glass 2019-05-06
 # Updated for Unicode 12.1 by Andrew Glass 2019-05-30
 # Updated for Unicode 13.0 by Andrew Glass 2020-07-28
@@ -58,16 +58,16 @@ AA35          ; Top     # Mn       CHAM CONSONANT SIGN
 # Indic_Positional_Category=Bottom
 0859..085B    ; Bottom # Mn   [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
 18A9          ; Bottom # Mn       MONGOLIAN LETTER ALI GALI DAGALGA
-10AE5         ; Bottom # Mn       MANICHAEAN ABBREVIATION MARK ABOVE  # Overridden, ccc controls order
+10AE5         ; Bottom # Mn       MANICHAEAN ABBREVIATION MARK ABOVE  # Overridden, ccc controls order
 10AE6         ; Bottom # Mn       MANICHAEAN ABBREVIATION MARK BELOW
 10F46..10F47  ; Bottom # Mn   [2] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING TWO DOTS BELOW
-10F48..10F4A  ; Bottom # Mn   [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE     # Overridden, ccc controls order
+10F48..10F4A  ; Bottom # Mn   [3] SOGDIAN COMBINING DOT ABOVE..SOGDIAN COMBINING CURVE ABOVE     # Overridden, ccc controls order
 10F4B         ; Bottom # Mn       SOGDIAN COMBINING CURVE BELOW
-10F4C         ; Bottom # Mn       SOGDIAN COMBINING HOOK ABOVE        # Overridden, ccc controls order
+10F4C         ; Bottom # Mn       SOGDIAN COMBINING HOOK ABOVE        # Overridden, ccc controls order
 10F4D..10F50  ; Bottom # Mn   [4] SOGDIAN COMBINING HOOK BELOW..SOGDIAN COMBINING STROKE BELOW
-10F82         ; Bottom # Mn       OLD UYGHUR COMBINING DOT ABOVE      # Overridden, ccc controls order
+10F82         ; Bottom # Mn       OLD UYGHUR COMBINING DOT ABOVE      # Overridden, ccc controls order
 10F83         ; Bottom # Mn       OLD UYGHUR COMBINING DOT BELOW
-10F84         ; Bottom # Mn       OLD UYGHUR COMBINING TWO DOTS ABOVE # Overridden, ccc controls order
+10F84         ; Bottom # Mn       OLD UYGHUR COMBINING TWO DOTS ABOVE # Overridden, ccc controls order
 10F85         ; Bottom # Mn       OLD UYGHUR COMBINING TWO DOTS BELOW
 16F4F         ; Bottom # Mn       MIAO SIGN CONSONANT MODIFIER BAR
 16F51..16F87  ; Bottom # Mc  [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI

diff --git a/src/aat/feature_mappings.rs b/src/aat/feature_mappings.rs
@@ -45,7 +45,7 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
     FeatureMapping::new(b"frac", FeatureType::Fractions, DIAGONAL_FRACTIONS, NO_FRACTIONS),
     FeatureMapping::new(b"fwid", FeatureType::TextSpacing, MONOSPACED_TEXT, 7),
     FeatureMapping::new(b"halt", FeatureType::TextSpacing, ALT_HALF_WIDTH_TEXT, 7),
-    FeatureMapping::new(b"hist", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
+    FeatureMapping::new(b"hist", FeatureType::Dummy, 0, 1),
     FeatureMapping::new(b"hkna", FeatureType::AlternateKana, ALTERNATE_HORIZ_KANA_ON, ALTERNATE_HORIZ_KANA_OFF),
     FeatureMapping::new(b"hlig", FeatureType::Ligatures, HISTORICAL_LIGATURES_ON, HISTORICAL_LIGATURES_OFF),
     FeatureMapping::new(b"hngl", FeatureType::Transliteration, HANJA_TO_HANGUL, NO_TRANSLITERATION),
@@ -107,5 +107,6 @@ pub const FEATURE_MAPPINGS: &[FeatureMapping] = &[
     FeatureMapping::new(b"vkna", FeatureType::AlternateKana, ALTERNATE_VERT_KANA_ON, ALTERNATE_VERT_KANA_OFF),
     FeatureMapping::new(b"vpal", FeatureType::TextSpacing, ALT_PROPORTIONAL_TEXT, 7),
     FeatureMapping::new(b"vrt2", FeatureType::VerticalSubstitution, SUBSTITUTE_VERTICAL_FORMS_ON, SUBSTITUTE_VERTICAL_FORMS_OFF),
+    FeatureMapping::new(b"vrtr", FeatureType::VerticalSubstitution, 2, 3),
     FeatureMapping::new(b"zero", FeatureType::TypographicExtras, SLASHED_ZERO_ON, SLASHED_ZERO_OFF),
 ];
diff --git a/src/aat/map.rs b/src/aat/map.rs
@@ -27,6 +27,9 @@ pub enum FeatureType {
     ContextualAlternatives = 36,
     LowerCase = 37,
     UpperCase = 38,
+
+    // harfbuzz maps "hist" to the raw AAT feature type 40, which has no named constant upstream.
+    Dummy = 40,
 }
 
 #[derive(Default)]

diff --git a/src/buffer.rs b/src/buffer.rs
@@ -84,6 +84,9 @@ pub mod glyph_flag {
     /// within text clusters.
     ///
     /// The UNSAFE_TO_BREAK flag will always imply this flag.
+    /// To use this flag, you must enable the buffer flag
+    /// `PRODUCE_UNSAFE_TO_CONCAT` during shaping, otherwise
+    /// this glyph flag will not be reliably produced.
     pub const UNSAFE_TO_CONCAT: u32 = 0x00000002;
 
     /// All the currently defined flags.
@@ -1349,10 +1352,18 @@ impl Buffer {
     }
 
     pub fn unsafe_to_concat(&mut self, start: Option<usize>, end: Option<usize>) {
+        if !self.flags.contains(BufferFlags::PRODUCE_UNSAFE_TO_CONCAT) {
+            return;
+        }
+
         self._set_glyph_flags(UNSAFE_TO_CONCAT, start, end, Some(true), None);
     }
 
     pub fn unsafe_to_break_from_outbuffer(&mut self, start: Option<usize>, end: Option<usize>) {
+        // Deliberately not gated on `BufferFlags::PRODUCE_UNSAFE_TO_CONCAT`: that
+        // flag only controls the optional UNSAFE_TO_CONCAT bookkeeping, whereas
+        // UNSAFE_TO_BREAK must always be recorded. Returning early here would
+        // silently drop UNSAFE_TO_BREAK for every buffer using default flags.
         self._set_glyph_flags(
             UNSAFE_TO_BREAK | UNSAFE_TO_CONCAT,
             start,
@@ -1720,6 +1731,8 @@ bitflags::bitflags! {
         const DO_NOT_INSERT_DOTTED_CIRCLE   = 1 << 5;
         /// Indicates that the shape() call and its variants should perform various verification processes on the results of the shaping operation on the buffer. If the verification fails, then either a buffer message is sent, if a message handler is installed on the buffer, or a message is written to standard error. In either case, the shaping result might be modified to show the failed output.
         const VERIFY                        = 1 << 6;
+        /// Indicates that the `UNSAFE_TO_CONCAT` glyph-flag should be produced by the shaper. By default it will not be produced since it incurs a cost.
+        const PRODUCE_UNSAFE_TO_CONCAT      = 1 << 7;
     }
 }
 

diff --git a/src/common.rs b/src/common.rs
@@ -452,6 +452,8 @@ pub mod script {
     pub const TOTO: Script = Script::from_bytes(b"Toto");
     pub const VITHKUQI: Script = Script::from_bytes(b"Vith");
 
+    pub const SCRIPT_MATH: Script = Script::from_bytes(b"Zmth");
+
     // https://github.com/harfbuzz/harfbuzz/issues/1162
     pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");
 }

diff --git a/src/complex/arabic.rs b/src/complex/arabic.rs
@@ -599,6 +599,11 @@ const MODIFIER_COMBINING_MARKS: &[u32] = &[
     0x06E3, // ARABIC SMALL LOW SEEN
     0x06E7, // ARABIC SMALL HIGH YEH
     0x06E8, // ARABIC SMALL HIGH NOON
+    0x08CA, // ARABIC SMALL HIGH FARSI YEH
+    0x08CB, // ARABIC SMALL HIGH YEH BARREE WITH TWO DOTS BELOW
+    0x08CD, // ARABIC SMALL HIGH ZAH
+    0x08CE, // ARABIC LARGE ROUND DOT ABOVE
+    0x08CF, // ARABIC LARGE ROUND DOT BELOW
     0x08D3, // ARABIC SMALL LOW WAW
     0x08F3, // ARABIC SMALL HIGH WAW
 ];

diff --git a/src/complex/universal.rs b/src/complex/universal.rs
@@ -48,7 +48,7 @@ pub mod category {
     pub const ZWNJ: u8 = 14; // Zero width non-joiner
 
     // pub const ZWJ: u8     = 15;   // Zero width joiner
-    // pub const WJ: u8      = 16;   // Word joiner
+    pub const WJ: u8 = 16; // Word joiner
 
     pub const RSV: u8 = 17; // Reserved characters
     pub const R: u8 = 18; // REPHA
@@ -296,8 +296,7 @@ fn setup_topographical_masks(plan: &ShapePlan, buffer: &mut Buffer) {
     let mut end = buffer.next_syllable(0);
     while start < buffer.len {
         let syllable = buffer.info[start].syllable() & 0x0F;
-        if syllable == SyllableType::SymbolCluster as u8
-            || syllable == SyllableType::HieroglyphCluster as u8
+        if syllable == SyllableType::HieroglyphCluster as u8
             || syllable == SyllableType::NonCluster as u8
         {
             last_form = None;
@@ -539,6 +538,6 @@ fn setup_masks(plan: &ShapePlan, _: &Face, buffer: &mut Buffer) {
     // We cannot setup masks here. We save information about characters
     // and setup masks later on in a pause-callback.
     for info in buffer.info_slice_mut() {
-        info.set_use_category(super::universal_table::get_category(info.glyph_id));
+        info.set_use_category(super::universal_table::get_category(info));
     }
 }