Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
[System::Runtime::InteropServices::Out] bool% isIndic,
[System::Runtime::InteropServices::Out] bool% isDigit,
[System::Runtime::InteropServices::Out] bool% isLatin,
[System::Runtime::InteropServices::Out] bool% isStrong
[System::Runtime::InteropServices::Out] bool% isStrong,
[System::Runtime::InteropServices::Out] bool% isExtended
);

/// <summary>
/// Check whether two Unicode scalar values belong to the same script.
/// This is used to determine if combining marks should stay with their base character
/// for font fallback purposes. (See PR #6857 / Issue #6801)
/// </summary>
/// <param name="unicodeScalar1">First Unicode scalar value to compare.</param>
/// <param name="unicodeScalar2">Second Unicode scalar value to compare.</param>
/// <returns>true when both values are classified under the same script; false otherwise.</returns>
bool IsSameScript(int unicodeScalar1, int unicodeScalar2);
};

}}}}//MS::Internal::Text::TextInterface

#endif //__ICLASSIFICATION_H
#endif //__ICLASSIFICATION_H
Original file line number Diff line number Diff line change
Expand Up @@ -155,54 +155,78 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
bool isStrong;
bool isExtended;

WCHAR ch = text[0];
classificationUtility->GetCharAttribute(
text[0],
ch,
isCombining,
needsCaretInfo,
isIndic,
isDigit,
isLatin,
isStrong
);

isExtended = ItemizerHelper::IsExtendedCharacter(text[0]);
isStrong,
isExtended
);

UINT32 isDigitRangeStart = 0;
UINT32 isDigitRangeEnd = 0;
bool previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
bool currentIsDigitValue;

// Track base character for combining mark script comparison (PR #6857 / Issue #6801)
// A combining mark should only stay with its base character if they have the same script.
int baseChar = isCombining ? -1 : ch;

// pCharAttribute is assumed to have the same length as text. This is enforced by Itemize().
pCharAttribute[0] = (CharAttributeType)
(((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
| ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
| ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None)
| ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None)
| ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None)
| ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None));
(((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
| ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
| ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None)
| ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None)
| ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None)
| ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None));

for (UINT32 i = 1; i < length; ++i)
{
ch = text[i];
classificationUtility->GetCharAttribute(
text[i],
isCombining,
needsCaretInfo,
isIndic,
isDigit,
isLatin,
isStrong
ch,
isCombining,
needsCaretInfo,
isIndic,
isDigit,
isLatin,
isStrong,
isExtended
);

isExtended = ItemizerHelper::IsExtendedCharacter(text[i]);

// For combining marks, check if they have the same script as the base character.
// If not, they should not be treated as combining with the base (PR #6857 / Issue #6801).
// However, script-agnostic combining marks (variation selectors, ZWJ, emoji modifiers, etc.)
// are designed to work with any base character regardless of script, so skip the check
// for them to allow emoji sequences to stay together.
bool isCombiningWithBase = isCombining;
if (isCombining && baseChar >= 0 && !isExtended)
{
if (!classificationUtility->IsSameScript(baseChar, ch))
{
// Different script - this combining mark should not stay with the base character
isCombiningWithBase = false;
}
}

// Update base character tracking
if (!isCombining)
{
baseChar = ch;
}

pCharAttribute[i] = (CharAttributeType)
(((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
| ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
| ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None)
| ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None)
| ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None)
| ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None));
(((isCombiningWithBase) ? CharAttribute::IsCombining : CharAttribute::None)
| ((needsCaretInfo) ? CharAttribute::NeedsCaretInfo : CharAttribute::None)
| ((isLatin) ? CharAttribute::IsLatin : CharAttribute::None)
| ((isIndic) ? CharAttribute::IsIndic : CharAttribute::None)
| ((isStrong) ? CharAttribute::IsStrong : CharAttribute::None)
| ((isExtended) ? CharAttribute::IsExtended : CharAttribute::None));


currentIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ public void GetCharAttribute(
out bool isIndic,
out bool isDigit,
out bool isLatin,
out bool isStrong
out bool isStrong,
out bool isExtended
)
{
CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
Expand All @@ -134,6 +135,16 @@ out bool isStrong
{
isIndic = IsScriptIndic(scriptId);
}

isExtended = Classification.IsScriptAgnosticCombining(unicodeScalar);
}

/// <summary>
/// Reports whether the two given Unicode scalar values are classified under the same script.
/// The decision is delegated to the static Classification.IsSameScript helper.
/// </summary>
public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
{
    bool sameScript = Classification.IsSameScript(unicodeScalar1, unicodeScalar2);
    return sameScript;
}

/// <summary>
Expand All @@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId)
}
}
}

/// <summary>
/// Hold the classification table pointers.
/// </summary>
Expand Down Expand Up @@ -253,16 +265,73 @@ public static short GetUnicodeClass(int unicodeScalar)


/// <summary>
/// Lookup script ID for a Unicode scalar value
/// Check whether two Unicode scalar values belong to the same script
/// </summary>
public static ScriptID GetScript(int unicodeScalar)
static public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
{
unsafe
{
return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script;
short unicodeClass1 = GetUnicodeClass(unicodeScalar1);
short unicodeClass2 = GetUnicodeClass(unicodeScalar2);
if (unicodeClass1 != unicodeClass2)
{
CharacterAttribute a1 = Classification.CharAttributeTable[unicodeClass1];
CharacterAttribute a2 = Classification.CharAttributeTable[unicodeClass2];
if (a1.Script != a2.Script)
{
return false;
}
}

return true;
}
}

/// <summary>
/// Tells whether a Unicode scalar value is a script-agnostic combining character,
/// i.e. one that is meant to stay with its base character even when the two
/// belong to different scripts.
/// </summary>
/// <remarks>
/// Covers the zero-width joiner, the variation selectors, several combining-mark
/// blocks and the emoji skin-tone modifiers. A typical case is a keycap emoji
/// sequence: digit + VS16 (U+FE0F) + combining enclosing keycap (U+20E3).
/// The checks are evaluated in order and stop at the first match.
/// </remarks>
static public bool IsScriptAgnosticCombining(int unicodeScalar)
{
    return
        // ZWJ - used in many emoji/grapheme clusters
        unicodeScalar == 0x200D
        // Variation Selectors VS1-VS16 (U+FE00-U+FE0F)
        || (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F)
        // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF)
        || IsIVS(unicodeScalar)
        // Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
        || (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF)
        // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
        || (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF)
        // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap
        || (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF)
        // Combining Half Marks (U+FE20-U+FE2F)
        || (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F)
        // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF)
        || (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF);
}

/// <summary>
/// Compute Unicode scalar value from unicode codepoint stream
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

//
Expand Down Expand Up @@ -304,7 +304,11 @@ out sizeofChar
{
// continue to advance for combining mark with base char (can be precomposed by shaping engine)
// except if it is a different script (#6801)
if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar))
// However, script-agnostic combining marks (variation selectors, combining enclosing marks)
// should stay with their base character regardless of script, to allow emoji sequences
// like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together.
if (Classification.IsScriptAgnosticCombining(originalChar)
|| Classification.IsSameScript(baseChar, originalChar))
{
continue;
}
Expand Down Expand Up @@ -359,10 +363,13 @@ out sizeofChar
//
// The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base
// char in front.
// Script-agnostic combining marks (variation selectors, combining enclosing marks) should
// also stay with the base character regardless of script differences.
if (Classification.IsJoiner(ch)
|| (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar))
|| (baseChar != NOBASE && Classification.IsCombining(ch)
&& (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch)))
)
continue;
continue;

// If we have a glyph it's valid.
if (font.HasCharacter(checked((uint)ch)))
Expand Down
Loading