Skip to content

Commit 05f51aa

Browse files
committed
clean up
1 parent dfd719c commit 05f51aa

File tree

1 file changed

+70
-189
lines changed

1 file changed

+70
-189
lines changed

src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs

+70-189
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,6 @@ internal static unsafe partial class Utf8Utility
2323
/// </remarks>
2424
public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
2525
{
26-
if (AdvSimd.Arm64.IsSupported)
27-
{
28-
return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
29-
}
30-
if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && Popcnt.X64.IsSupported)
31-
{
32-
return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
33-
}
34-
if (Avx2.IsSupported && Popcnt.X64.IsSupported)
35-
{
36-
return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
37-
}
38-
3926
Debug.Assert(inputLength >= 0, "Input length must not be negative.");
4027
Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
4128

@@ -54,12 +41,39 @@ internal static unsafe partial class Utf8Utility
5441
return pInputBuffer;
5542
}
5643

44+
if (AdvSimd.Arm64.IsSupported)
45+
{
46+
return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
47+
}
48+
if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && Popcnt.X64.IsSupported)
49+
{
50+
return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
51+
}
52+
if (Avx2.IsSupported && Popcnt.X64.IsSupported)
53+
{
54+
return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
55+
}
56+
return GetPointerToFirstInvalidByte_Default(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
57+
}
58+
59+
// Returns &pInputBuffer[inputLength] if the input buffer is valid.
60+
/// <summary>
61+
/// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
62+
/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
63+
/// </summary>
64+
/// <remarks>
65+
/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
66+
/// </remarks>
67+
private static byte* GetPointerToFirstInvalidByte_Default(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
68+
{
69+
Debug.Assert(inputLength >= 0, "Input length must not be negative.");
70+
Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
71+
5772
#if DEBUG
5873
// Keep these around for final validation at the end of the method.
5974
byte* pOriginalInputBuffer = pInputBuffer;
6075
int originalInputLength = inputLength;
6176
#endif
62-
6377
// Enregistered locals that we'll eventually out to our caller.
6478

6579
int tempUtf16CodeUnitCountAdjustment = 0;
@@ -792,18 +806,19 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
792806
{
793807
// We skip any ASCII characters at the start of the buffer
794808
int asciirun = 0;
795-
for (; asciirun + 64 <= inputLength; asciirun += 64)
796-
{
797-
Vector128<byte> block1 = Vector128.Load(pInputBuffer + asciirun);
798-
Vector128<byte> block2 = Vector128.Load(pInputBuffer + asciirun + 16);
799-
Vector128<byte> block3 = Vector128.Load(pInputBuffer + asciirun + 32);
800-
Vector128<byte> block4 = Vector128.Load(pInputBuffer + asciirun + 48);
801-
Vector128<byte> or = (block1 | block2) | (block3 | block4);
802-
if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127)
803-
{
804-
break;
805-
}
806-
}
809+
//for (; asciirun + 64 <= inputLength; asciirun += 64)
810+
//{
811+
// Vector128<byte> block1 = Vector128.Load(pInputBuffer + asciirun);
812+
// Vector128<byte> block2 = Vector128.Load(pInputBuffer + asciirun + 16);
813+
// Vector128<byte> block3 = Vector128.Load(pInputBuffer + asciirun + 32);
814+
// Vector128<byte> block4 = Vector128.Load(pInputBuffer + asciirun + 48);
815+
// Vector128<byte> or = (block1 | block2) | (block3 | block4);
816+
// if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127)
817+
// {
818+
// break;
819+
// }
820+
//}
821+
// NOTE: the input's first byte is already non-ASCII, so there is no leading ASCII run to skip here.
807822
processedLength = asciirun;
808823

809824
if (processedLength + 32 < inputLength)
@@ -981,9 +996,10 @@ private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bit
981996
return pInputBuffer + inputLength;
982997
}
983998
}
984-
return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
999+
return GetPointerToFirstInvalidByte_Default(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
9851000
}
9861001

1002+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
9871003
private static void RemoveCounters(byte* start, byte* end, ref int n4, ref int contbytes)
9881004
{
9891005
for (byte* p = start; p < end; p++)
@@ -999,6 +1015,7 @@ private static void RemoveCounters(byte* start, byte* end, ref int n4, ref int c
9991015
}
10001016
}
10011017

1018+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
10021019
private static void AddCounters(byte* start, byte* end, ref int n4, ref int contbytes)
10031020
{
10041021
for (byte* p = start; p < end; p++)
@@ -1138,6 +1155,7 @@ private static void AddCounters(byte* start, byte* end, ref int n4, ref int cont
11381155
return buf + len; // no error
11391156
}
11401157

1158+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
11411159
private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustments(int n4, int contbytes)
11421160
{
11431161
int n3 = -2 * n4 + 2 * contbytes;
@@ -1147,145 +1165,6 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustmen
11471165
return (utfadjust, scalaradjust);
11481166
}
11491167

1150-
private static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
1151-
{
1152-
int TempUtf16CodeUnitCountAdjustment = 0;
1153-
int TempScalarCountAdjustment = 0;
1154-
1155-
int pos = 0;
1156-
int nextPos;
1157-
uint codePoint = 0;
1158-
1159-
while (pos < inputLength)
1160-
{
1161-
1162-
byte firstByte = pInputBuffer[pos];
1163-
while (firstByte < 0b10000000)
1164-
{
1165-
if (++pos == inputLength)
1166-
{
1167-
1168-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1169-
scalarCountAdjustment = TempScalarCountAdjustment;
1170-
return pInputBuffer + inputLength;
1171-
}
1172-
firstByte = pInputBuffer[pos];
1173-
}
1174-
1175-
if ((firstByte & 0b11100000) == 0b11000000)
1176-
{
1177-
nextPos = pos + 2;
1178-
if (nextPos > inputLength)
1179-
{
1180-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1181-
scalarCountAdjustment = TempScalarCountAdjustment;
1182-
return pInputBuffer + pos;
1183-
} // Too short
1184-
if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000)
1185-
{
1186-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1187-
scalarCountAdjustment = TempScalarCountAdjustment;
1188-
return pInputBuffer + pos;
1189-
} // Too short
1190-
// range check
1191-
codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(pInputBuffer[pos + 1] & 0b00111111);
1192-
if ((codePoint < 0x80) || (0x7ff < codePoint))
1193-
{
1194-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1195-
scalarCountAdjustment = TempScalarCountAdjustment;
1196-
return pInputBuffer + pos;
1197-
} // Overlong
1198-
TempUtf16CodeUnitCountAdjustment -= 1;
1199-
}
1200-
else if ((firstByte & 0b11110000) == 0b11100000)
1201-
{
1202-
nextPos = pos + 3;
1203-
if (nextPos > inputLength)
1204-
{
1205-
1206-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1207-
scalarCountAdjustment = TempScalarCountAdjustment;
1208-
return pInputBuffer + pos;
1209-
} // Too short
1210-
// range check
1211-
codePoint = (uint)(firstByte & 0b00001111) << 12 |
1212-
(uint)(pInputBuffer[pos + 1] & 0b00111111) << 6 |
1213-
(uint)(pInputBuffer[pos + 2] & 0b00111111);
1214-
// Either overlong or too large:
1215-
if ((codePoint < 0x800) || (0xffff < codePoint) ||
1216-
(0xd7ff < codePoint && codePoint < 0xe000))
1217-
{
1218-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1219-
scalarCountAdjustment = TempScalarCountAdjustment;
1220-
return pInputBuffer + pos;
1221-
}
1222-
if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000)
1223-
{
1224-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1225-
scalarCountAdjustment = TempScalarCountAdjustment;
1226-
return pInputBuffer + pos;
1227-
} // Too short
1228-
if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000)
1229-
{
1230-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1231-
scalarCountAdjustment = TempScalarCountAdjustment;
1232-
return pInputBuffer + pos;
1233-
} // Too short
1234-
TempUtf16CodeUnitCountAdjustment -= 2;
1235-
}
1236-
else if ((firstByte & 0b11111000) == 0b11110000)
1237-
{
1238-
nextPos = pos + 4;
1239-
if (nextPos > inputLength)
1240-
{
1241-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1242-
scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos;
1243-
}
1244-
if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000)
1245-
{
1246-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1247-
scalarCountAdjustment = TempScalarCountAdjustment;
1248-
return pInputBuffer + pos;
1249-
}
1250-
if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000)
1251-
{
1252-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1253-
scalarCountAdjustment = TempScalarCountAdjustment;
1254-
return pInputBuffer + pos;
1255-
}
1256-
if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000)
1257-
{
1258-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1259-
scalarCountAdjustment = TempScalarCountAdjustment;
1260-
return pInputBuffer + pos;
1261-
}
1262-
// range check
1263-
codePoint =
1264-
(uint)(firstByte & 0b00000111) << 18 | (uint)(pInputBuffer[pos + 1] & 0b00111111) << 12 |
1265-
(uint)(pInputBuffer[pos + 2] & 0b00111111) << 6 | (uint)(pInputBuffer[pos + 3] & 0b00111111);
1266-
if (codePoint <= 0xffff || 0x10ffff < codePoint)
1267-
{
1268-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1269-
scalarCountAdjustment = TempScalarCountAdjustment;
1270-
return pInputBuffer + pos;
1271-
}
1272-
TempUtf16CodeUnitCountAdjustment -= 2;
1273-
TempScalarCountAdjustment -= 1;
1274-
}
1275-
else
1276-
{
1277-
// we may have a continuation/too long error
1278-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1279-
scalarCountAdjustment = TempScalarCountAdjustment;
1280-
return pInputBuffer + pos;
1281-
}
1282-
pos = nextPos;
1283-
}
1284-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
1285-
scalarCountAdjustment = TempScalarCountAdjustment;
1286-
return pInputBuffer + inputLength;
1287-
}
1288-
12891168
[CompExactlyDependsOn(typeof(Avx2))]
12901169
[CompExactlyDependsOn(typeof(Popcnt.X64))]
12911170
private static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
@@ -1301,16 +1180,17 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustmen
13011180
{
13021181
// We skip any ASCII characters at the start of the buffer
13031182
int asciirun = 0;
1304-
for (; asciirun + 64 <= inputLength; asciirun += 64)
1305-
{
1306-
Vector256<byte> block1 = Avx.LoadVector256(pInputBuffer + asciirun);
1307-
Vector256<byte> block2 = Avx.LoadVector256(pInputBuffer + asciirun + 32);
1308-
Vector256<byte> or = Avx2.Or(block1, block2);
1309-
if (Avx2.MoveMask(or) != 0)
1310-
{
1311-
break;
1312-
}
1313-
}
1183+
//for (; asciirun + 64 <= inputLength; asciirun += 64)
1184+
//{
1185+
// Vector256<byte> block1 = Avx.LoadVector256(pInputBuffer + asciirun);
1186+
// Vector256<byte> block2 = Avx.LoadVector256(pInputBuffer + asciirun + 32);
1187+
// Vector256<byte> or = Avx2.Or(block1, block2);
1188+
// if (Avx2.MoveMask(or) != 0)
1189+
// {
1190+
// break;
1191+
// }
1192+
//}
1193+
// NOTE: the input's first byte is already non-ASCII, so there is no leading ASCII run to skip here.
13141194
processedLength = asciirun;
13151195

13161196
if (processedLength + 32 < inputLength)
@@ -1434,7 +1314,7 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustmen
14341314
{
14351315
// We have an ASCII block, no need to process it, but
14361316
// we need to check if the previous block was incomplete.
1437-
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
1317+
if (!Avx.TestZ(prevIncomplete, prevIncomplete))
14381318
{
14391319
byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
14401320
// So the code is correct up to invalidBytePointer
@@ -1552,7 +1432,7 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustmen
15521432
return pInputBuffer + inputLength;
15531433
}
15541434
}
1555-
return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
1435+
return GetPointerToFirstInvalidByte_Default(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
15561436
}
15571437

15581438
[CompExactlyDependsOn(typeof(Avx512Vbmi))]
@@ -1572,16 +1452,17 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustmen
15721452
// We skip any ASCII characters at the start of the buffer
15731453
// We intentionally use AVX2 instead of AVX-512.
15741454
int asciirun = 0;
1575-
for (; asciirun + 64 <= inputLength; asciirun += 64)
1576-
{
1577-
Vector256<byte> block1 = Avx.LoadVector256(pInputBuffer + asciirun);
1578-
Vector256<byte> block2 = Avx.LoadVector256(pInputBuffer + asciirun + 32);
1579-
Vector256<byte> or = Avx2.Or(block1, block2);
1580-
if (Avx2.MoveMask(or) != 0)
1581-
{
1582-
break;
1583-
}
1584-
}
1455+
//for (; asciirun + 64 <= inputLength; asciirun += 64)
1456+
//{
1457+
// Vector256<byte> block1 = Avx.LoadVector256(pInputBuffer + asciirun);
1458+
// Vector256<byte> block2 = Avx.LoadVector256(pInputBuffer + asciirun + 32);
1459+
// Vector256<byte> or = Avx2.Or(block1, block2);
1460+
// if (Avx2.MoveMask(or) != 0)
1461+
// {
1462+
// break;
1463+
// }
1464+
//}
1465+
// NOTE: the input's first byte is already non-ASCII, so there is no leading ASCII run to skip here.
15851466
processedLength = asciirun;
15861467

15871468
if (processedLength + 64 < inputLength)
@@ -1880,7 +1761,7 @@ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustmen
18801761
return pInputBuffer + inputLength;
18811762
}
18821763
}
1883-
return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
1764+
return GetPointerToFirstInvalidByte_Default(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
18841765
}
18851766
}
18861767
}

0 commit comments

Comments
 (0)