Skip to content

Commit b37896e

Browse files
authored
Feature | Use hardcoded LCID mappings when decoding strings (dotnet#4212)
* Use hardcoded LCID mappings when decoding strings
* Reformat LCID/codepage mapping table, adding LCID descriptions
* Correct debug assertions, remove unused using
* Add unit tests for LocalesHelper
1 parent 57b6581 commit b37896e

4 files changed

Lines changed: 319 additions & 344 deletions

File tree

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using System.Diagnostics;
7+
8+
#nullable enable
9+
10+
namespace Microsoft.Data.SqlClient;
11+
12+
/// <summary>
/// Static lookup tables and helpers that translate a collation's sort ID or LCID into a code page number.
/// </summary>
internal static class LocalesHelper
{
    /// <summary>
    /// Number of LCID/code page pairs expected in <see cref="LcidToCodePageMappings"/>.
    /// The flat array therefore holds twice this many elements; this is verified by a debug assertion.
    /// </summary>
    private const int LocaleMappingCount = 207;

    /// <summary>
    /// Array copied directly from tdssort.h from luxor.
    /// Use the sort ID as an index into this array to retrieve the code page.
    /// If the value is zero, the index is not a valid sort ID.
    /// </summary>
    private static ReadOnlySpan<ushort> SortIdToCodePageMappings => [
        // 0-29: reserved
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 30-35
        437, 437, 437, 437, 437, 437,
        // 36-39: reserved
        0, 0, 0, 0,
        // 40-45
        850, 850, 850, 850, 850, 850,
        // 46-48: reserved
        0, 0, 0,
        // 49-61
        850, 1252, 1252, 1252, 1252, 1252, 850, 850, 850, 850, 850, 850, 850,
        // 62-70: reserved
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 71-75
        1252, 1252, 1252, 1252, 1252,
        // 76-79: reserved
        0, 0, 0, 0,
        // 80-98
        1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 1250,
        // 99-103: reserved
        0, 0, 0, 0, 0,
        // 104-108
        1251, 1251, 1251, 1251, 1251,
        // 109-111: reserved
        0, 0, 0,
        // 112-114
        1253, 1253, 1253,
        // 115-119: reserved
        0, 0, 0, 0, 0,
        // 120-122
        1253, 1253, 1253,
        // 123: reserved
        0,
        // 124
        1253,
        // 125-127: reserved
        0, 0, 0,
        // 128-130
        1254, 1254, 1254,
        // 131-135: reserved
        0, 0, 0, 0, 0,
        // 136-138
        1255, 1255, 1255,
        // 139-143: reserved
        0, 0, 0, 0, 0,
        // 144-146
        1256, 1256, 1256,
        // 147-151: reserved
        0, 0, 0, 0, 0,
        // 152-160
        1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257, 1257,
        // 161-182: reserved
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 183-186
        1252, 1252, 1252, 1252,
        // 187-191: reserved
        0, 0, 0, 0, 0,
        // 192-206
        932, 932, 949, 949, 950, 950, 936, 936, 932, 949, 950, 936, 874, 874, 874,
        // 207-209: reserved
        0, 0, 0,
        // 210-217
        1252, 1252, 1252, 1252, 1252, 1252, 1252, 1252,
        // 218-255
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ];

    /// <summary>
    /// Maps LCIDs to code pages. Each pair of values in the array represents an LCID and its corresponding code page.
    /// This means that even indexes in the array are LCIDs, and the following odd index is the code page for that LCID.
    /// </summary>
    /// <remarks>
    /// This is aligned with the mssql-jdbc driver. See the following file for the original data:
    /// https://github.com/microsoft/mssql-jdbc/blob/23e935b846af5f112f445289e89d54a379e8b8ed/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java#L141-L361
    /// </remarks>
    private static ReadOnlySpan<int> LcidToCodePageMappings => [
        // ar_SA, bg_BG, ca_ES, zh_TW, cs_CZ, da_DK, de_DE, el_GR
        0x0401, 1256, 0x0402, 1251, 0x0403, 1252, 0x0404, 950, 0x0405, 1250, 0x0406, 1252, 0x0407, 1252, 0x0408, 1253,

        // en_US, es_ES_tradnl, fi_FI, fr_FR, he_IL, hu_HU, is_IS, it_IT
        0x0409, 1252, 0x040a, 1252, 0x040b, 1252, 0x040c, 1252, 0x040d, 1255, 0x040e, 1250, 0x040f, 1252, 0x0410, 1252,

        // ja_JP, ko_KR, nl_NL, nb_NO, pl_PL, pt_BR, rm_CH, ro_RO
        0x0411, 932, 0x0412, 949, 0x0413, 1252, 0x0414, 1252, 0x0415, 1250, 0x0416, 1252, 0x0417, 1252, 0x0418, 1250,

        // ru_RU, hr_HR, sk_SK, sq_AL, sv_SE, th_TH, tr_TR, ur_PK
        0x0419, 1251, 0x041a, 1250, 0x041b, 1250, 0x041c, 1250, 0x041d, 1252, 0x041e, 874, 0x041f, 1254, 0x0420, 1256,

        // id_ID, uk_UA, be_BY, sl_SI, et_EE, lv_LV, lt_LT, tg_Cyrl_TJ
        0x0421, 1252, 0x0422, 1251, 0x0423, 1251, 0x0424, 1250, 0x0425, 1257, 0x0426, 1257, 0x0427, 1257, 0x0428, 1251,

        // fa_IR, vi_VN, hy_AM, az_Latn_AZ, eu_ES, wen_DE, mk_MK, tn_ZA
        0x0429, 1256, 0x042a, 1258, 0x042b, 1252, 0x042c, 1254, 0x042d, 1252, 0x042e, 1252, 0x042f, 1251, 0x0432, 1252,

        // xh_ZA, zu_ZA, af_ZA, ka_GE, fo_FO, hi_IN, mt_MT, se_NO
        0x0434, 1252, 0x0435, 1252, 0x0436, 1252, 0x0437, 1252, 0x0438, 1252, 0x0439, 1200, 0x043a, 1200, 0x043b, 1252,

        // ms_MY, kk_KZ, ky_KG, sw_KE, tk_TM, uz_Latn_UZ, tt_RU, bn_IN
        0x043e, 1252, 0x043f, 1251, 0x0440, 1251, 0x0441, 1252, 0x0442, 1250, 0x0443, 1254, 0x0444, 1251, 0x0445, 1200,

        // pa_IN, gu_IN, or_IN, ta_IN, te_IN, kn_IN, ml_IN, as_IN
        0x0446, 1200, 0x0447, 1200, 0x0448, 1200, 0x0449, 1200, 0x044a, 1200, 0x044b, 1200, 0x044c, 1200, 0x044d, 1200,

        // mr_IN, sa_IN, mn_MN, bo_CN, cy_GB, km_KH, lo_LA, gl_ES
        0x044e, 1200, 0x044f, 1200, 0x0450, 1251, 0x0451, 1200, 0x0452, 1252, 0x0453, 1200, 0x0454, 1200, 0x0456, 1252,

        // kok_IN, syr_SY, si_LK, iu_Cans_CA, am_ET, ne_NP, fy_NL, ps_AF
        0x0457, 1200, 0x045a, 1200, 0x045b, 1200, 0x045d, 1252, 0x045e, 1252, 0x0461, 1200, 0x0462, 1252, 0x0463, 1200,

        // fil_PH, dv_MV, ha_Latn_NG, yo_NG, quz_BO, nso_ZA, ba_RU, lb_LU
        0x0464, 1252, 0x0465, 1200, 0x0468, 1252, 0x046a, 1252, 0x046b, 1252, 0x046c, 1252, 0x046d, 1251, 0x046e, 1252,

        // kl_GL, ig_NG, ii_CN, arn_CL, moh_CA, br_FR, ug_CN, mi_NZ
        0x046f, 1252, 0x0470, 1252, 0x0478, 1252, 0x047a, 1252, 0x047c, 1252, 0x047e, 1252, 0x0480, 1256, 0x0481, 1200,

        // oc_FR, co_FR, gsw_FR, sah_RU, qut_GT, rw_RW, wo_SN, prs_AF
        0x0482, 1252, 0x0483, 1252, 0x0484, 1252, 0x0485, 1251, 0x0486, 1252, 0x0487, 1252, 0x0488, 1252, 0x048c, 1256,

        // ar_IQ, zh_CN, de_CH, en_GB, es_MX, fr_BE, it_CH, nl_BE
        0x0801, 1256, 0x0804, 936, 0x0807, 1252, 0x0809, 1252, 0x080a, 1252, 0x080c, 1252, 0x0810, 1252, 0x0813, 1252,

        // nn_NO, pt_PT, sr_Latn_CS, sv_FI, Lithuanian_Classic, az_Cyrl_AZ, dsb_DE, se_SE
        0x0814, 1252, 0x0816, 1252, 0x081a, 1250, 0x081d, 1252, 0x0827, 1257, 0x082c, 1251, 0x082e, 1252, 0x083b, 1252,

        // ga_IE, ms_BN, uz_Cyrl_UZ, bn_BD, mn_Mong_CN, iu_Latn_CA, tzm_Latn_DZ, quz_EC
        0x083c, 1252, 0x083e, 1252, 0x0843, 1251, 0x0845, 1200, 0x0850, 1251, 0x085d, 1252, 0x085f, 1252, 0x086b, 1252,

        // ar_EG, zh_HK, de_AT, en_AU, es_ES, fr_CA, sr_Cyrl_CS, se_FI
        0x0c01, 1256, 0x0c04, 950, 0x0c07, 1252, 0x0c09, 1252, 0x0c0a, 1252, 0x0c0c, 1252, 0x0c1a, 1251, 0x0c3b, 1252,

        // quz_PE, ar_LY, zh_SG, de_LU, en_CA, es_GT, fr_CH, hr_BA
        0x0c6b, 1252, 0x1001, 1256, 0x1004, 936, 0x1007, 1252, 0x1009, 1252, 0x100a, 1252, 0x100c, 1252, 0x101a, 1250,

        // smj_NO, ar_DZ, zh_MO, de_LI, en_NZ, es_CR, fr_LU, bs_Latn_BA
        0x103b, 1252, 0x1401, 1256, 0x1404, 950, 0x1407, 1252, 0x1409, 1252, 0x140a, 1252, 0x140c, 1252, 0x141a, 1250,

        // smj_SE, ar_MA, en_IE, es_PA, fr_MC, sr_Latn_BA, sma_NO, ar_TN
        0x143b, 1252, 0x1801, 1256, 0x1809, 1252, 0x180a, 1252, 0x180c, 1252, 0x181a, 1250, 0x183b, 1252, 0x1c01, 1256,

        // en_ZA, es_DO, sr_Cyrl_BA, sma_SB, ar_OM, en_JM, es_VE, bs_Cyrl_BA
        0x1c09, 1252, 0x1c0a, 1252, 0x1c1a, 1251, 0x1c3b, 1252, 0x2001, 1256, 0x2009, 1252, 0x200a, 1252, 0x201a, 1251,

        // sms_FI, ar_YE, en_CB, es_CO, smn_FI, ar_SY, en_BZ, es_PE
        0x203b, 1252, 0x2401, 1256, 0x2409, 1252, 0x240a, 1252, 0x243b, 1252, 0x2801, 1256, 0x2809, 1252, 0x280a, 1252,

        // ar_JO, en_TT, es_AR, ar_LB, en_ZW, es_EC, ar_KW, en_PH
        0x2c01, 1256, 0x2c09, 1252, 0x2c0a, 1252, 0x3001, 1256, 0x3009, 1252, 0x300a, 1252, 0x3401, 1256, 0x3409, 1252,

        // es_CL, ar_AE, es_UY, ar_BH, es_PY, ar_QA, en_IN, es_BO
        0x340a, 1252, 0x3801, 1256, 0x380a, 1252, 0x3c01, 1256, 0x3c0a, 1252, 0x4001, 1256, 0x4009, 1252, 0x400a, 1252,

        // en_MY, es_SV, en_SG, es_HN, es_NI, es_PR, es_US
        0x4409, 1252, 0x440a, 1252, 0x4809, 1252, 0x480a, 1252, 0x4c0a, 1252, 0x500a, 1252, 0x540a, 1252,
    ];

    /// <summary>
    /// Attempts to resolve the code page that corresponds to a sort ID or, failing that, an LCID.
    /// </summary>
    /// <param name="lcid">
    /// Locale identifier. Only consulted when <paramref name="sortId"/> is zero, and only its low 16 bits
    /// take part in the lookup.
    /// </param>
    /// <param name="sortId">
    /// Sort ID. When non-zero it is used as an index into <see cref="SortIdToCodePageMappings"/> and
    /// <paramref name="lcid"/> is ignored, even if the sort ID turns out to be reserved or out of range.
    /// </param>
    /// <param name="codePage">The resolved code page, or zero when no mapping exists.</param>
    /// <returns><see langword="true"/> if a non-zero code page was found; otherwise <see langword="false"/>.</returns>
    public static bool TryGetCodePage(int lcid, int sortId, out int codePage)
    {
        if (sortId != 0)
        {
            ReadOnlySpan<ushort> sortIdMappings = SortIdToCodePageMappings;

            // The unsigned comparison rejects negative sort IDs and too-large ones in a single check.
            codePage = (uint)sortId < (uint)sortIdMappings.Length
                ? sortIdMappings[sortId]
                : 0;
        }
        else
        {
            codePage = GetCodePageByLcid(lcid & 0xFFFF);
        }

        return codePage != 0;
    }

    /// <summary>
    /// Looks up the code page paired with <paramref name="lcid"/> in <see cref="LcidToCodePageMappings"/>.
    /// </summary>
    /// <param name="lcid">The LCID to search for.</param>
    /// <returns>The code page stored next to the LCID, or zero if the LCID is not in the table.</returns>
    private static int GetCodePageByLcid(int lcid)
    {
        Debug.Assert(LcidToCodePageMappings.Length == LocaleMappingCount * 2);

        ReadOnlySpan<int> lcidMappings = LcidToCodePageMappings;

        // Walk the table pair by pair so that only the LCID slots (even indexes) are ever compared. A value that
        // merely matches one of the code page slots can therefore never be mistaken for, or hide, an LCID entry.
        // The loop condition also guarantees that index + 1 is in range, so the method has no exception paths.
        for (int index = 0; index + 1 < lcidMappings.Length; index += 2)
        {
            if (lcidMappings[index] == lcid)
            {
                return lcidMappings[index + 1];
            }
        }

        return 0;
    }
}

0 commit comments

Comments
 (0)