@@ -1277,7 +1277,18 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
}
return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
}
-
+ public static void ToString(Vector128<byte> v)
+ {
+     Span<byte> b = stackalloc byte[16];
+     v.CopyTo(b);
+     Console.WriteLine(Convert.ToHexString(b));
+ }
+ public static void ToString(Vector128<sbyte> v)
+ {
+     Span<byte> b = stackalloc byte[16];
+     v.AsByte().CopyTo(b);
+     Console.WriteLine(Convert.ToHexString(b));
+ }
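+ // The two overloads above are debugging helpers: they dump the 16 lanes
+ // of a vector as an uppercase hex string, so ToString(Vector128<byte>.Zero)
+ // prints "00000000000000000000000000000000".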
public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
int processedLength = 0;
@@ -1360,18 +1371,31 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
// The block goes from processedLength to processedLength/16*16.
int contbytes = 0; // number of continuation bytes in the block
int n4 = 0; // number of 4-byte sequences that start in this block
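+ // For context: each 4-byte sequence encodes a supplementary code point,
+ // which becomes a surrogate pair in UTF-16, so contbytes and n4 are
+ // presumably what the utf16CodeUnitCountAdjustment and
+ // scalarCountAdjustment out-parameters are derived from.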
+ /////
+ // Design:
+ // Instead of updating n4 and contbytes continuously, we accumulate
+ // the values in n4v and contv, while using overflowCounter to make
+ // sure we do not overflow. This allows you to reach good performance
+ // on systems where summing across vectors is slow.
+ ////
+ Vector128<sbyte> n4v = Vector128<sbyte>.Zero;
+ Vector128<sbyte> contv = Vector128<sbyte>.Zero;
+ int overflowCounter = 0;
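+ // Sketch of the trick: an AdvSimd comparison sets a matching lane to
+ // 0xFF, which reads as -1 in an sbyte lane, so after several blocks a
+ // lane holds minus the number of matches it has seen. Summing the lanes
+ // with AddAcrossWidening and negating recovers the count, e.g.:
+ //     Vector128<sbyte> allMatch = Vector128.Create((sbyte)-1);
+ //     int c = -AdvSimd.Arm64.AddAcrossWidening(allMatch).ToScalar(); // 16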
for (; processedLength + 16 <= inputLength; processedLength += 16)
{
Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);
if ((currentBlock & v80) == Vector128<byte>.Zero)
- // We could also use (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some
- // hardware.
{
// We have an ASCII block, no need to process it, but
// we need to check if the previous block was incomplete.
if (prevIncomplete != Vector128<byte>.Zero)
{
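+ // For example, if the previous block ended with a lone lead byte such
+ // as 0xC3, prevIncomplete is nonzero, and this all-ASCII block proves
+ // the sequence was truncated; we rewind and revalidate below.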
+ // Flush the vector accumulators so the error path below reads
+ // up-to-date contbytes and n4 values.
+ contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+ if (n4v != Vector128<sbyte>.Zero)
+ {
+     n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+ }
int off = processedLength >= 3 ? processedLength - 3 : processedLength;
byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
// So the code is correct up to invalidBytePointer
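+ // Rewinding three bytes lets the scalar validator re-read any
+ // multi-byte sequence that straddles the previous block boundary.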
@@ -1432,11 +1456,13 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
Vector128<byte> must23As80 = AdvSimd.And(must23, v80);
Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
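+ // Roughly, under the usual lookup-table validation scheme: must23As80
+ // has 0x80 set exactly where a third or fourth continuation byte is
+ // required, and sc has 0x80 where one was observed, so the XOR leaves
+ // a nonzero lane wherever expectation and input disagree.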
- // AdvSimd.Arm64.MaxAcross(error) works, but it might be slower
- // than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some
- // hardware:
if (error != Vector128<byte>.Zero)
{
+ // As above, flush the vector accumulators before taking the error path.
+ contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+ if (n4v != Vector128<sbyte>.Zero)
+ {
+     n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+ }
byte* invalidBytePointer;
if (processedLength == 0)
{
@@ -1459,17 +1485,32 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
return invalidBytePointer;
}
prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
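+ // Assuming maxValue follows the usual construction (0xFF in every lane
+ // except 0xEF, 0xDF, 0xBF in the last three), the saturating subtract
+ // is nonzero exactly when the block ends in a multi-byte sequence that
+ // cannot be completed within the block.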
- contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar();
- Vector128<byte> largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
- if (largerthan0f != Vector128<byte>.Zero)
+ contv += AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
+ n4v += AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
+ overflowCounter++;
+ // Each comparison subtracts at most 1 from an sbyte lane, so a lane
+ // could wrap past -128 after 128 blocks. We therefore flush every
+ // 0x7f = 127 blocks: we empty contv and n4v, and update contbytes
+ // and n4.
+ if (overflowCounter == 0x7f)
{
- byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
- int negn4add = (int)(byte)-n4add;
- n4 += negn4add;
+ overflowCounter = 0;
+ contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+ contv = Vector128<sbyte>.Zero;
+ if (n4v != Vector128<sbyte>.Zero)
+ {
+     n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+     n4v = Vector128<sbyte>.Zero;
+ }
}
}
}
- bool hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
+ // Final flush of the vector accumulators once the main loop is done.
+ contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+ if (n4v != Vector128<sbyte>.Zero)
+ {
+     n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+ }
+
+ bool hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
if (processedLength < inputLength || hasIncompete)
{
byte* invalidBytePointer;