diff --git a/Jint/Native/Global/GlobalObject.Properties.cs b/Jint/Native/Global/GlobalObject.Properties.cs
index 575aef1ba..5784f8848 100644
--- a/Jint/Native/Global/GlobalObject.Properties.cs
+++ b/Jint/Native/Global/GlobalObject.Properties.cs
@@ -30,7 +30,7 @@ public partial class GlobalObject
     private static readonly Key propertyInt16Array = "Int16Array";
     private static readonly Key propertyInt32Array = "Int32Array";
     private static readonly Key propertyInt8Array = "Int8Array";
-    //private static readonly Key propertyIntl = "Intl";
+    private static readonly Key propertyIntl = "Intl";
     private static readonly Key propertyJSON = "JSON";
     private static readonly Key propertyMap = "Map";
     private static readonly Key propertyMath = "Math";
@@ -109,7 +109,7 @@ protected override void Initialize()
         properties.AddDangerous(propertyInt16Array, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Int16Array, PropertyFlags));
         properties.AddDangerous(propertyInt32Array, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Int32Array, PropertyFlags));
         properties.AddDangerous(propertyInt8Array, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Int8Array, PropertyFlags));
-        // TODO properties.AddDapropertygerous(propertyIntl, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Intl, propertyFlags));
+        properties.AddDangerous(propertyIntl, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Intl, PropertyFlags));
         properties.AddDangerous(propertyJSON, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Json, PropertyFlags));
         properties.AddDangerous(propertyMap, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Map, PropertyFlags));
         properties.AddDangerous(propertyMath, new LazyPropertyDescriptor(this, static global => global._realm.Intrinsics.Math, PropertyFlags));
diff --git a/Jint/Native/Intl/Icu.cs b/Jint/Native/Intl/Icu.cs
new file mode 100644
index 000000000..07325bb98
--- /dev/null
+++ b/Jint/Native/Intl/Icu.cs
@@ -0,0 +1,108 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Text;
+namespace Jint.Native.Intl;
+
+// ICU (International Components for Unicode) is a native C/C++ library provided by the OS
+// that implements BCP-47 locale canonicalization, alias resolution, and other i18n data.
+// https://github.com/unicode-org/icu
+// We use DllImport to bind directly to its functions (e.g. uloc_toLanguageTag) so we can
+// reuse the OS-provided ICU implementation instead of reimplementing the spec in C#.
+internal static class ICU
+{
+    private const string MacLib = "/usr/lib/libicucore.dylib";
+    private const string LinuxUc = "icuuc";     // resolves to libicuuc.so[.N]
+    private const string LinuxI18n = "icui18n"; // resolves to libicui18n.so[.N]
+
+    // NOTE(review): confirm the build defines OSX / LINUX; they are not SDK-predefined symbols, so
+    // without explicit <DefineConstants> every platform would take the #else (Windows) names.
+#if OSX || MACCATALYST || IOS || TVOS
+    private const string UcLib = MacLib;
+    private const string I18nLib = MacLib;
+#elif LINUX
+    private const string UcLib = LinuxUc;
+    private const string I18nLib = LinuxI18n;
+#else
+    // Windows: prefer bundling (put icuucNN.dll/icuinNN.dll next to your .exe)
+    private const string UcLib = "icuuc";   // icuucNN.dll via loader search path
+    private const string I18nLib = "icuin"; // icuinNN.dll
+#endif
+
+    // ICU error code enum (partial)
+    // Negative values are warnings (the call succeeded); positive values are failures.
+    public enum UErrorCode : int
+    {
+        U_STRING_NOT_TERMINATED_WARNING = -124,
+        U_ZERO_ERROR = 0,
+        U_ILLEGAL_ARGUMENT_ERROR = 1,
+        U_BUFFER_OVERFLOW_ERROR = 15,
+        // ... add more as needed
+    }
+
+    [DllImport(UcLib, CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi)]
+    public static extern int uloc_countAvailable();
+
+    [DllImport(UcLib, CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi)]
+    public static extern IntPtr uloc_getAvailable(int n);
+
+    // Example for something in i18n (collation)
+    [DllImport(I18nLib, CallingConvention = CallingConvention.Cdecl, CharSet = CharSet.Ansi)]
+    public static extern int ucol_countAvailable();
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static string PtrToAnsiString(IntPtr p) => Marshal.PtrToStringAnsi(p)!;
+
+    [DllImport(UcLib, CallingConvention = CallingConvention.Cdecl, EntryPoint = "uloc_toLanguageTag")]
+    private static extern unsafe int uloc_toLanguageTag_ptr(
+        byte* localeIdUtf8, // const char* (UTF-8)
+        byte[] langtag, // UTF-8 out
+        int langtagCapacity,
+        [MarshalAs(UnmanagedType.I1)] bool strict,
+        ref UErrorCode err);
+
+    public static unsafe int uloc_toLanguageTag(
+        string localeId, byte[] langtag, int langtagCapacity, bool strict, ref UErrorCode err)
+    {
+        // NUL-terminate for C
+        byte[] inBytes = Encoding.UTF8.GetBytes(localeId + "\0");
+        fixed (byte* p = inBytes)
+        {
+            return uloc_toLanguageTag_ptr(p, langtag, langtagCapacity, strict, ref err);
+        }
+    }
+
+    [DllImport(UcLib, CallingConvention = CallingConvention.Cdecl, EntryPoint = "uloc_forLanguageTag")]
+    private static extern unsafe int uloc_forLanguageTag_ptr(
+        byte* langtagUtf8, // const char* (UTF-8)
+        byte[] localeId, // UTF-8 out
+        int localeIdCapacity,
+        out int parsedLength,
+        ref UErrorCode err);
+
+    public static unsafe int uloc_forLanguageTag(string langtag, byte[] localeId, int localeIdCapacity, out int parsedLength, ref UErrorCode err)
+    {
+        var inBytes = Encoding.UTF8.GetBytes(langtag + "\0");
+        fixed (byte* p = inBytes)
+        {
+            return uloc_forLanguageTag_ptr(p, localeId, localeIdCapacity, out parsedLength, ref err);
+        }
+    }
+
+    [DllImport(UcLib, CallingConvention = CallingConvention.Cdecl, EntryPoint = "uloc_canonicalize")]
+    private static extern unsafe int uloc_canonicalize_ptr(
+        byte* localeIdUtf8, // const char* (UTF-8, NUL-terminated)
+        byte[] name, // out buffer (UTF-8, no trailing NUL guaranteed)
+        int nameCapacity,
+        ref UErrorCode err);
+
+    public static unsafe int uloc_canonicalize(string localeId, byte[] name, int nameCapacity, ref UErrorCode err)
+    {
+        // NUL-terminate input for C
+        var inBytes = Encoding.UTF8.GetBytes(localeId + "\0");
+        fixed (byte* p = inBytes)
+        {
+            return uloc_canonicalize_ptr(p, name, nameCapacity, ref err);
+        }
+    }
+}
+
diff --git a/Jint/Native/Intl/IcuHelpers.cs b/Jint/Native/Intl/IcuHelpers.cs
new file mode 100644
index 000000000..16450f3ee
--- /dev/null
+++ b/Jint/Native/Intl/IcuHelpers.cs
@@ -0,0 +1,294 @@
+using System.Text;
+using Jint.Runtime;
+
+namespace Jint.Native.Intl
+{
+    /// <summary>
+    /// ICU interop + ECMA-402 canonicalization helpers shared by Intl built-ins.
+    /// </summary>
+    internal static class IcuHelpers
+    {
+        /// <summary>
+        /// Mirrors WebKit's canonicalizeUnicodeExtensionsAfterICULocaleCanonicalization():
+        /// - Finds the "-u-" extension and its end (before the next singleton).
+        /// - Re-emits the extension with per-key normalization:
+        ///   * For keys kb/kc/kh/kk/kn: drop boolean "true" (and treat "yes" as true → drop).
+        ///   * For all other keys: keep "yes"; if ICU turned "yes" into "true", revert to "yes".
+        ///   * For "rg"/"sd": canonicalize subdivision aliases (no23→no50, ...).
+        ///   * For "tz": canonicalize timezone aliases (eire→iedub, est→papty, ...).
+        /// Everything else in the tag is preserved.
+        /// </summary>
+        public static string CanonicalizeUnicodeExtensionsAfterIcu(string tag)
+        {
+            if (string.IsNullOrEmpty(tag))
+                return tag;
+
+            int extensionIndex = tag.IndexOf("-u-", StringComparison.OrdinalIgnoreCase);
+            if (extensionIndex < 0)
+                return tag;
+
+            // Determine the end of the -u- block (before the next singleton like -x-).
+            int extensionLength = tag.Length - extensionIndex;
+            int end = extensionIndex + 3;
+            while (end < tag.Length)
+            {
+                int dash = tag.IndexOf('-', end);
+                if (dash < 0)
+                    break;
+                if (dash + 2 < tag.Length && tag[dash + 2] == '-')
+                {
+                    extensionLength = dash - extensionIndex;
+                    break;
+                }
+                end = dash + 1;
+            }
+
+            var result = new StringBuilder(tag.Length + 8);
+
+            // Copy up to and including "-u"
+            result.Append(tag, 0, extensionIndex + 2);
+
+            // Process "-u-..." segment
+            string extension = tag.Substring(extensionIndex, extensionLength);
+            var parts = extension.Split('-'); // parts[0] == "", parts[1] == "u"
+            int i = 2;
+
+            while (i < parts.Length)
+            {
+                string subtag = parts[i];
+                if (subtag.Length == 0) { i++; continue; }
+
+                // Emit the key or attribute
+                result.Append('-');
+                result.Append(subtag);
+
+                if (subtag.Length == 2)
+                {
+                    // It's a key.
+                    string key = subtag;
+                    bool keyIsDroppableTrue = s_trueDroppableKeys.Contains(key);
+
+                    int valueStart = i + 1;
+                    int valueEnd = valueStart;
+                    while (valueEnd < parts.Length && parts[valueEnd].Length != 2 && parts[valueEnd].Length != 0)
+                        valueEnd++;
+
+                    bool emittedAnyValue = false;
+
+                    for (int v = valueStart; v < valueEnd; v++)
+                    {
+                        string value = parts[v];
+                        if (value.Length == 0)
+                            continue;
+
+                        // Handle "yes"/"true" normalization
+                        if (value.Equals("yes", StringComparison.OrdinalIgnoreCase))
+                        {
+                            if (keyIsDroppableTrue)
+                            {
+                                // Drop boolean true for droppable keys.
+                                continue;
+                            }
+                            // keep "yes" for non-droppable
+                        }
+                        else if (value.Equals("true", StringComparison.OrdinalIgnoreCase))
+                        {
+                            if (keyIsDroppableTrue)
+                            {
+                                // Drop boolean true for droppable keys.
+                                continue;
+                            }
+                            // Non-droppable: canonicalize to "yes"
+                            value = "yes";
+                        }
+
+                        // Per-key aliasing
+                        if (key.Equals("rg", StringComparison.OrdinalIgnoreCase) ||
+                            key.Equals("sd", StringComparison.OrdinalIgnoreCase))
+                        {
+                            value = CanonicalizeSubdivision(value);
+                        }
+                        else if (key.Equals("tz", StringComparison.OrdinalIgnoreCase))
+                        {
+                            value = CanonicalizeTimeZoneType(value);
+                        }
+
+                        result.Append('-');
+                        result.Append(value);
+                        emittedAnyValue = true;
+                    }
+
+                    // If **no** value was emitted for a **non-droppable** key, synthesize "-yes".
+                    if (!emittedAnyValue && !keyIsDroppableTrue)
+                    {
+                        result.Append("-yes");
+                    }
+
+                    i = valueEnd;
+                }
+                else
+                {
+                    // Attribute (or malformed); just pass through.
+                    i++;
+                }
+            }
+
+            // Append remainder after the -u- block
+            result.Append(tag, extensionIndex + extensionLength, tag.Length - (extensionIndex + extensionLength));
+            return result.ToString();
+        }
+
+        /// <summary>
+        /// Validates `tag` as a BCP-47 language tag via ICU and returns a canonical tag.
+        /// Throws RangeError on invalid tags (spec-compliant).
+        /// </summary>
+        public static string CanonicalizeUnicodeLocaleIdOrThrow(Realm realm, string tag)
+        {
+            // 1) Validate & parse BCP-47 -> ICU locale ID
+            var status = ICU.UErrorCode.U_ZERO_ERROR;
+            byte[] locBuf = new byte[128];
+            int parsed;
+            int need = ICU.uloc_forLanguageTag(tag, locBuf, locBuf.Length, out parsed, ref status);
+
+            if (need > locBuf.Length)
+            {
+                locBuf = new byte[need];
+                status = ICU.UErrorCode.U_ZERO_ERROR;
+                need = ICU.uloc_forLanguageTag(tag, locBuf, locBuf.Length, out parsed, ref status);
+            }
+
+            // ICU reports warnings (e.g. U_STRING_NOT_TERMINATED_WARNING when the result exactly
+            // fills the buffer) as negative codes; only positive codes are failures.
+            // NOTE(review): `need <= 0` also rejects tags whose ICU locale ID is empty (presumably "und") - confirm.
+            if (status > ICU.UErrorCode.U_ZERO_ERROR || parsed != tag.Length || need <= 0)
+            {
+                // RangeError per spec
+                Throw.RangeError(realm, $"invalid language tag: {tag}");
+            }
+
+            string icuLocaleId = Encoding.UTF8.GetString(locBuf, 0, need);
+
+            // 2) Canonicalize the ICU locale ID (this applies CLDR language/region/script aliases, e.g. cmn->zh)
+            status = ICU.UErrorCode.U_ZERO_ERROR;
+            byte[] canonLoc = new byte[System.Math.Max(need + 16, 256)];
+            int canonLen = ICU.uloc_canonicalize(icuLocaleId, canonLoc, canonLoc.Length, ref status);
+
+            if (canonLen > canonLoc.Length)
+            {
+                canonLoc = new byte[canonLen];
+                status = ICU.UErrorCode.U_ZERO_ERROR;
+                canonLen = ICU.uloc_canonicalize(icuLocaleId, canonLoc, canonLoc.Length, ref status);
+            }
+
+            string icuCanonical = (status <= ICU.UErrorCode.U_ZERO_ERROR && canonLen > 0)
+                ? Encoding.UTF8.GetString(canonLoc, 0, canonLen)
+                : icuLocaleId; // fall back if canonicalize failed or produced nothing
+
+            // 3) Convert canonical ICU locale ID -> canonical BCP-47 tag
+            status = ICU.UErrorCode.U_ZERO_ERROR;
+            byte[] outBuf = new byte[256];
+            int len = ICU.uloc_toLanguageTag(icuCanonical, outBuf, outBuf.Length, strict: false, ref status);
+
+            if (len > outBuf.Length)
+            {
+                outBuf = new byte[len];
+                status = ICU.UErrorCode.U_ZERO_ERROR;
+                len = ICU.uloc_toLanguageTag(icuCanonical, outBuf, outBuf.Length, strict: false, ref status);
+            }
+
+            if (status > ICU.UErrorCode.U_ZERO_ERROR || len <= 0)
+            {
+                Throw.RangeError(realm, $"failed to canonicalize language tag: {tag}");
+            }
+
+            var canonical = Encoding.UTF8.GetString(outBuf, 0, len);
+
+            // WebKit-style cleanup for "-u-…-true"
+            canonical = CanonicalizeUnicodeExtensionsAfterIcu(canonical);
+
+            // Fallback for ICU builds that don't alias cmn->zh
+            canonical = FixKnownLanguageAliases(canonical);
+
+            return canonical;
+        }
+
+        // Keys whose boolean "true" value is **elided** in canonical form.
+        // For these, "-u-<key>-yes" and "-u-<key>-true" both canonicalize to just "-u-<key>".
+        // Add "ca" here so a bare `-u-ca` does not synthesize `-yes`
+        private static readonly HashSet<string> s_trueDroppableKeys = new(StringComparer.OrdinalIgnoreCase)
+        {
+            "kb", "kc", "kh", "kk", "kn", "ca"
+        };
+
+        // Canonicalize subdivision aliases (used for rg/sd values).
+        private static string CanonicalizeSubdivision(string value)
+        {
+            switch (value.ToLowerInvariant())
+            {
+                case "no23": return "no50";
+                case "cn11": return "cnbj";
+                case "cz10a": return "cz110";
+                case "fra": return "frges";
+                case "frg": return "frges";
+                case "lud": return "lucl"; // test262 prefers the first in replacement list
+                default: return value;
+            }
+        }
+
+        // Canonicalize time zone type aliases (used for tz values).
+        private static string CanonicalizeTimeZoneType(string value)
+        {
+            switch (value.ToLowerInvariant())
+            {
+                case "cnckg": return "cnsha"; // deprecated -> preferred
+                case "eire": return "iedub"; // alias -> canonical
+                case "est": return "papty"; // alias -> canonical
+                case "gmt0": return "gmt"; // alias -> canonical
+                case "uct": return "utc"; // alias -> canonical
+                case "zulu": return "utc"; // alias -> canonical
+                case "utcw05": return "papty"; // short offset alias seen in test262
+                default: return value;
+            }
+        }
+
+        // Rewrites a deprecated primary language subtag (cmn, ji, in) to its preferred form.
+        private static string FixKnownLanguageAliases(string canonicalTag)
+        {
+            if (string.IsNullOrEmpty(canonicalTag))
+                return canonicalTag;
+
+            // Split once: "xx[-…]" → lang + rest (rest includes the leading '-')
+            int dash = canonicalTag.IndexOf('-');
+            ReadOnlySpan<char> lang = dash < 0
+                ? canonicalTag.AsSpan()
+                : canonicalTag.AsSpan(0, dash);
+
+            // We'll append the remainder (if any) after we swap the primary language subtag.
+            ReadOnlySpan<char> rest = dash < 0
+                ? ReadOnlySpan<char>.Empty
+                : canonicalTag.AsSpan(dash); // includes '-...'
+
+            // Known primary language aliases not consistently handled by older ICU:
+            // - cmn → zh (Mandarin → Chinese)
+            // - ji → yi
+            // - in → id
+            if (lang.Equals("cmn".AsSpan(), StringComparison.OrdinalIgnoreCase))
+            {
+                return rest.IsEmpty ? "zh" : "zh" + rest.ToString();
+            }
+
+            if (lang.Equals("ji".AsSpan(), StringComparison.OrdinalIgnoreCase))
+            {
+                return rest.IsEmpty ? "yi" : "yi" + rest.ToString();
+            }
+
+            if (lang.Equals("in".AsSpan(), StringComparison.OrdinalIgnoreCase))
+            {
+                return rest.IsEmpty ? "id" : "id" + rest.ToString();
+            }
+
+            // Otherwise, leave as-is.
+            return canonicalTag;
+        }
+    }
+}
diff --git a/Jint/Native/Intl/IntlInstance.cs b/Jint/Native/Intl/IntlInstance.cs
index d8f60823a..f2a0216d6 100644
--- a/Jint/Native/Intl/IntlInstance.cs
+++ b/Jint/Native/Intl/IntlInstance.cs
@@ -38,7 +38,7 @@ protected override void Initialize()
             ["PluralRules"] = new(_realm.Intrinsics.PluralRules, false, false, true),
             ["RelativeTimeFormat"] = new(_realm.Intrinsics.RelativeTimeFormat, false, false, true),
             ["Segmenter"] = new(_realm.Intrinsics.Segmenter, false, false, true),
-            ["getCanonicalLocales "] = new(new ClrFunction(Engine, "getCanonicalLocales ", GetCanonicalLocales, 1, PropertyFlag.Configurable), true, false, true),
+            ["getCanonicalLocales"] = new(new ClrFunction(Engine, "getCanonicalLocales", GetCanonicalLocales, 1, PropertyFlag.Configurable), true, false, true),
         };
         SetProperties(properties);
 
@@ -49,8 +49,91 @@ protected override void Initialize()
         SetSymbols(symbols);
     }
 
+    // CanonicalizeLocaleList(locales)
+    // Spec: https://tc39.es/ecma402/#sec-canonicalizelocalelist
+    // Behavior mirrors WebKit’s implementation: if `locales` is undefined -> empty list.
+    // Otherwise iterate array-like `O`, accept String or Intl.Locale objects, validate
+    // BCP-47 via ICU (uloc_forLanguageTag), then canonicalize (uloc_toLanguageTag).
+    private List<string> CanonicalizeLocaleList(JsValue locales)
+    {
+        var seen = new List<string>();
+
+        // 1. If locales is undefined, return empty list
+        if (locales.IsUndefined())
+            return seen;
+
+        // 3–4. Build O:
+        // If locales is a String or an Intl.Locale, behave like [ locales ].
+        // We special-case String; for Intl.Locale we don’t have the brand slot here,
+        // so we treat other objects via ToObject (spec 4).
+        bool treatAsSingle = locales.IsString();
+
+        ObjectInstance O;
+        if (treatAsSingle)
+        {
+            var arr = _realm.Intrinsics.Array.Construct(Arguments.Empty);
+            arr.SetIndexValue(0, locales, updateLength: true);
+            O = arr;
+        }
+        else
+        {
+            O = TypeConverter.ToObject(_realm, locales);
+        }
+
+        // 5. Let len be LengthOfArrayLike(O)
+        var lenValue = O.Get("length");
+        var len = TypeConverter.ToLength(lenValue);
+
+        var dedupe = new HashSet<string>(System.StringComparer.Ordinal);
+
+        // 6–7. Iterate k
+        for (ulong k = 0; k < len; k++)
+        {
+            var pk = TypeConverter.ToString(k); // ToString(𝔽(k))
+            bool kPresent = O.HasProperty(pk);
+            if (!kPresent)
+                continue;
+
+            var kValue = O.Get(pk);
+
+            // 7.c.ii: must be String or Object
+            if (!kValue.IsString() && !kValue.IsObject())
+            {
+                Throw.TypeError(_realm, "locale value must be a string or object");
+            }
+
+            // 7.c.iii/iv: tag from Locale.[[Locale]] or ToString(kValue)
+            // We don’t have direct [[InitializedLocale]] plumbing here, so use ToString unless it’s a JS string.
+            string tag = kValue.IsString()
+                ? kValue.AsString().ToString()
+                : TypeConverter.ToString(kValue);
+
+            // 7.c.v–vii: Validate & canonicalize; throw RangeError if invalid
+            string canonical = IcuHelpers.CanonicalizeUnicodeLocaleIdOrThrow(_realm, tag);
+
+            if (dedupe.Add(canonical))
+                seen.Add(canonical);
+        }
+
+        // 8. Return seen
+        return seen;
+    }
+
+    // Intl.getCanonicalLocales(locales)
+    // https://tc39.es/ecma402/#sec-intl.getcanonicallocales
     private JsValue GetCanonicalLocales(JsValue thisObject, JsCallArguments arguments)
     {
-        return new JsArray(_engine);
+        var locales = arguments.At(0);
+        var list = CanonicalizeLocaleList(locales);
+
+        var arr = new JsArray(_engine);
+        arr.Prototype = _realm.Intrinsics.Array.PrototypeObject;
+
+        for (uint i = 0; i < list.Count; i++)
+        {
+            arr.SetIndexValue(i, list[(int) i], updateLength: true);
+        }
+
+        return arr;
     }
 }