Skip to content

Commit d716840

Browse files
author
Ubuntu
committed
fix: optimize the ARM function for systems with weak SIMD performance
1 parent 6f92b06 commit d716840

File tree

1 file changed

+54
-13
lines changed

1 file changed

+54
-13
lines changed

src/UTF8.cs

+54-13
Original file line numberDiff line numberDiff line change
@@ -1277,7 +1277,18 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
12771277
}
12781278
return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
12791279
}
1280-
1280+
/// <summary>
/// Debugging aid: prints the 16 bytes of <paramref name="v"/> to the console
/// as a single hexadecimal string followed by a newline.
/// </summary>
/// <remarks>
/// Despite its name, this method returns nothing; the text is written
/// directly to <see cref="Console"/>.
/// </remarks>
/// <param name="v">The 128-bit vector whose bytes are dumped.</param>
public static void ToString(Vector128<byte> v)
{
    // Scratch space on the stack: one slot per byte of the vector.
    Span<byte> lanes = stackalloc byte[16];
    v.CopyTo(lanes);

    string hex = Convert.ToHexString(lanes);
    Console.WriteLine(hex);
}
1286+
/// <summary>
/// Debugging aid: prints the 16 bytes of <paramref name="v"/> to the console
/// as a single hexadecimal string followed by a newline.
/// </summary>
/// <remarks>
/// The signed lanes are reinterpreted as unsigned bytes before formatting, so
/// the output shows the raw bit pattern of each lane. Despite its name, this
/// method returns nothing; the text is written directly to <see cref="Console"/>.
/// </remarks>
/// <param name="v">The 128-bit vector of signed bytes to dump.</param>
public static void ToString(Vector128<sbyte> v)
{
    // Reinterpret (no value conversion) so the bytes can be hex-formatted.
    Vector128<byte> raw = v.AsByte();

    // Scratch space on the stack: one slot per byte of the vector.
    Span<byte> lanes = stackalloc byte[16];
    raw.CopyTo(lanes);

    string hex = Convert.ToHexString(lanes);
    Console.WriteLine(hex);
}
12811292
public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
12821293
{
12831294
int processedLength = 0;
@@ -1360,18 +1371,31 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
13601371
// The block goes from processedLength to processedLength/16*16.
13611372
int contbytes = 0; // number of continuation bytes in the block
13621373
int n4 = 0; // number of 4-byte sequences that start in this block
1374+
/////
1375+
// Design:
1376+
// Instead of updating n4 and contbytes continuously, we accumulate
1377+
// the values in n4v and contv, while using overflowCounter to make
1378+
// sure we do not overflow. This allows you to reach good performance
1379+
// on systems where summing across vectors is slow.
1380+
////
1381+
Vector128<sbyte> n4v = Vector128<sbyte>.Zero;
1382+
Vector128<sbyte> contv = Vector128<sbyte>.Zero;
1383+
int overflowCounter = 0;
13631384
for (; processedLength + 16 <= inputLength; processedLength += 16)
13641385
{
13651386

13661387
Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);
13671388
if ((currentBlock & v80) == Vector128<byte>.Zero)
1368-
// We could also use (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some
1369-
// hardware.
13701389
{
13711390
// We have an ASCII block, no need to process it, but
13721391
// we need to check if the previous block was incomplete.
13731392
if (prevIncomplete != Vector128<byte>.Zero)
13741393
{
1394+
contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
1395+
if (n4v != Vector128<sbyte>.Zero)
1396+
{
1397+
n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
1398+
}
13751399
int off = processedLength >= 3 ? processedLength - 3 : processedLength;
13761400
byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
13771401
// So the code is correct up to invalidBytePointer
@@ -1432,11 +1456,13 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
14321456
Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
14331457
Vector128<byte> must23As80 = AdvSimd.And(must23, v80);
14341458
Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
1435-
// AdvSimd.Arm64.MaxAcross(error) works, but it might be slower
1436-
// than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some
1437-
// hardware:
14381459
if (error != Vector128<byte>.Zero)
14391460
{
1461+
contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
1462+
if (n4v != Vector128<sbyte>.Zero)
1463+
{
1464+
n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
1465+
}
14401466
byte* invalidBytePointer;
14411467
if (processedLength == 0)
14421468
{
@@ -1459,17 +1485,32 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
14591485
return invalidBytePointer;
14601486
}
14611487
prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
1462-
contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar();
1463-
Vector128<byte> largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
1464-
if (largerthan0f != Vector128<byte>.Zero)
1488+
contv += AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
1489+
n4v += AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
1490+
overflowCounter++;
1491+
// We have a risk of overflow if overflowCounter reaches 255,
1492+
// in which case, we empty contv and n4v, and update contbytes and
1493+
// n4.
1494+
if (overflowCounter == 0xff)
14651495
{
1466-
byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
1467-
int negn4add = (int)(byte)-n4add;
1468-
n4 += negn4add;
1496+
overflowCounter = 0;
1497+
contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
1498+
contv = Vector128<sbyte>.Zero;
1499+
if (n4v != Vector128<sbyte>.Zero)
1500+
{
1501+
n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
1502+
n4v = Vector128<sbyte>.Zero;
1503+
}
14691504
}
14701505
}
14711506
}
1472-
bool hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
1507+
contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
1508+
if (n4v != Vector128<sbyte>.Zero)
1509+
{
1510+
n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
1511+
}
1512+
1513+
bool hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
14731514
if (processedLength < inputLength || hasIncompete)
14741515
{
14751516
byte* invalidBytePointer;

0 commit comments

Comments
 (0)