dotnet · etvorun · Jan 26, 2026 · Jan 27, 2026
@@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
             [System::Runtime::InteropServices::Out] bool% isIndic,
             [System::Runtime::InteropServices::Out] bool% isDigit,
             [System::Runtime::InteropServices::Out] bool% isLatin,
-            [System::Runtime::InteropServices::Out] bool% isStrong
+            [System::Runtime::InteropServices::Out] bool% isStrong,
+            [System::Runtime::InteropServices::Out] bool% isExtended
             );
+
+        /// <summary>
+        /// Returns true when the two Unicode scalar values belong to the same script.
+        /// Used to decide whether a combining mark should stay with its base character
+        /// for font fallback purposes. (See PR #6857 / Issue #6801)
+        /// </summary>
+        bool IsSameScript(int unicodeScalar1, int unicodeScalar2);
     };
 
 }}}}//MS::Internal::Text::TextInterface
 
-#endif //__ICLASSIFICATION_H
+#endif //__ICLASSIFICATION_H
@@ -155,54 +155,78 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
         bool isStrong;
         bool isExtended;
 
+        WCHAR ch = text[0];
         classificationUtility->GetCharAttribute(
-            text[0],
+            ch,
             isCombining,
             needsCaretInfo,
             isIndic,
             isDigit,
             isLatin,
-            isStrong
-            );
-
-        isExtended = ItemizerHelper::IsExtendedCharacter(text[0]);
+            isStrong,
+            isExtended
+        );
 
         UINT32 isDigitRangeStart = 0;
         UINT32 isDigitRangeEnd = 0;
         bool   previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
         bool   currentIsDigitValue;
 
+        // Track base character for combining mark script comparison (PR #6857 / Issue #6801)
+        // A combining mark should only stay with its base character if they have the same script.
+        int baseChar = isCombining ? -1 : ch;
+
         // pCharAttribute is assumed to have the same length as text. This is enforced by Itemize().
         pCharAttribute[0] = (CharAttributeType)
-                            (((isCombining)    ? CharAttribute::IsCombining    : CharAttribute::None)
-                           | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
-                           | ((isLatin)        ? CharAttribute::IsLatin        : CharAttribute::None)
-                           | ((isIndic)        ? CharAttribute::IsIndic        : CharAttribute::None)
-                           | ((isStrong)       ? CharAttribute::IsStrong       : CharAttribute::None)
-                           | ((isExtended)     ? CharAttribute::IsExtended     : CharAttribute::None));
+            (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
+                | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
+                | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None)
+                | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None)
+                | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None)
+                | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None));
 
         for (UINT32 i = 1; i < length; ++i)
         {
+            ch = text[i];
             classificationUtility->GetCharAttribute(
-            text[i],
-            isCombining,
-            needsCaretInfo,
-            isIndic,
-            isDigit,
-            isLatin,
-            isStrong
+                ch,
+                isCombining,
+                needsCaretInfo,
+                isIndic,
+                isDigit,
+                isLatin,
+                isStrong,
+                isExtended
             );
 
-            isExtended = ItemizerHelper::IsExtendedCharacter(text[i]);
-
+            // A combining mark keeps IsCombining only if it shares the base character's script
+            // (PR #6857 / Issue #6801). Script-agnostic marks (variation selectors, ZWJ, emoji
+            // modifiers, etc.), flagged via isExtended, skip the check so emoji sequences stay together.
+            // NOTE(review): isExtended is now filled by GetCharAttribute instead of
+            // ItemizerHelper::IsExtendedCharacter - confirm CharAttribute::IsExtended consumers expect this.
+            bool isCombiningWithBase = isCombining;
+            if (isCombining && baseChar >= 0 && !isExtended)
+            {
+                if (!classificationUtility->IsSameScript(baseChar, ch))
+                {
+                    // Different script - this combining mark should not stay with the base character
+                    isCombiningWithBase = false;
+                }
+            }
+
+            // Update base character tracking
+            if (!isCombining)
+            {
+                baseChar = ch;
+            }
 
             pCharAttribute[i] = (CharAttributeType)
-                                (((isCombining)    ? CharAttribute::IsCombining    : CharAttribute::None)
-                               | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
-                               | ((isLatin)        ? CharAttribute::IsLatin        : CharAttribute::None)
-                               | ((isIndic)        ? CharAttribute::IsIndic        : CharAttribute::None)
-                               | ((isStrong)       ? CharAttribute::IsStrong       : CharAttribute::None)
-                               | ((isExtended)     ? CharAttribute::IsExtended     : CharAttribute::None));
+                (((isCombiningWithBase) ? CharAttribute::IsCombining : CharAttribute::None)
+                    | ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
+                    | ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None)
+                    | ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None)
+                    | ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None)
+                    | ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None));
 
 
             currentIsDigitValue = (numberCulture == nullptr) ? false : isDigit;

@@ -108,7 +108,8 @@ public void GetCharAttribute(
                                     out bool isIndic,
                                     out bool isDigit,
                                     out bool isLatin,
-                                    out bool isStrong
+                                    out bool isStrong,
+                                    out bool isExtended
                                     )
         {
             CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
@@ -134,6 +135,16 @@ out bool isStrong
             {
                 isIndic = IsScriptIndic(scriptId);
             }
+
+            isExtended = Classification.IsScriptAgnosticCombining(unicodeScalar);
+        }
+
+        /// <summary>
+        /// Check whether two Unicode scalar values belong to the same script (forwards to Classification.IsSameScript).
+        /// </summary>
+        public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
+        {
+            return Classification.IsSameScript(unicodeScalar1, unicodeScalar2);
         }
 
         /// <summary>
@@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId)
             }
         }
     }
+
     /// <summary>
     /// Hold the classification table pointers. 
     /// </summary>    
@@ -253,16 +265,73 @@ public static short GetUnicodeClass(int unicodeScalar)
 
 
         /// <summary>
-        /// Lookup script ID for a Unicode scalar value
+        /// Check whether two Unicode scalar values share a Unicode class or, failing that, the same Script entry in CharAttributeTable
         /// </summary>
-        public static ScriptID GetScript(int unicodeScalar)
+        static public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
         {
             unsafe
             {
-                return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script;
+                short unicodeClass1 = GetUnicodeClass(unicodeScalar1);
+                short unicodeClass2 = GetUnicodeClass(unicodeScalar2);
+                if (unicodeClass1 != unicodeClass2)
+                {
+                    CharacterAttribute a1 = Classification.CharAttributeTable[unicodeClass1];
+                    CharacterAttribute a2 = Classification.CharAttributeTable[unicodeClass2];
+                    if (a1.Script != a2.Script)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
             }
         }
 
+        /// <summary>
+        /// Check whether the character is a script-agnostic combining mark that should
+        /// stay with its base character regardless of script differences.
+        /// </summary>
+        /// <remarks>
+        /// Covers ZWJ, variation selectors (VS1-VS16 plus IVS), several combining-mark blocks and
+        /// emoji modifiers; a typical case is the keycap sequence "1️⃣" (digit + VS16 + U+20E3).
+        /// These characters are designed to modify any base character regardless of script.
+        /// </remarks>
+        static public bool IsScriptAgnosticCombining(int unicodeScalar)
+        {
+            // Zero Width Joiner (U+200D) - used in many emoji/grapheme clusters
+            if (unicodeScalar == 0x200D)
+                return true;
+
+            // Variation Selectors VS1-VS16 (U+FE00-U+FE0F)
+            if (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F)
+                return true;
+
+            // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF)
+            if (IsIVS(unicodeScalar))
+                return true;
+
+            // Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
+            if (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF)
+                return true;
+
+            // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
+            if (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF)
+                return true;
+
+            // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap
+            if (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF)
+                return true;
+
+            // Combining Half Marks (U+FE20-U+FE2F)
+            if (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F)
+                return true;
+
+            // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF)
+            if (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF)
+                return true;
+
+            return false;
+        }
 
         /// <summary>
         /// Compute Unicode scalar value from unicode codepoint stream

@@ -1,4 +1,4 @@
-// Licensed to the .NET Foundation under one or more agreements.
+// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
 //
@@ -304,7 +304,11 @@ out sizeofChar
                 {
                     // continue to advance for combining mark with base char (can be precomposed by shaping engine)
                     // except if it is a different script (#6801)
-                    if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar))
+                    // However, script-agnostic combining marks (variation selectors, combining enclosing marks)
+                    // should stay with their base character regardless of script, to allow emoji sequences
+                    // like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together.
+                    if (Classification.IsScriptAgnosticCombining(originalChar)
+                        || Classification.IsSameScript(baseChar, originalChar))
                     {
                         continue;
                     }
@@ -359,10 +363,13 @@ out sizeofChar
                     //
                     // The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base
                     // char in front.
+                    // Script-agnostic combining marks (variation selectors, combining enclosing marks) should
+                    // also stay with the base character regardless of script differences.
                     if (Classification.IsJoiner(ch)
-                       || (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar))
+                       || (baseChar != NOBASE && Classification.IsCombining(ch)
+                           && (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch)))
                        )
-                       continue;
+                        continue;
 
                     // If we have a glyph it's valid.
                     if (font.HasCharacter(checked((uint)ch)))