15
15
#define SSE4_1_FUNC
16
16
#define AVX2_FUNC
17
17
#define AVX3_FUNC
18
+ #define AVX512_ICL_FUNC
18
19
#else
19
20
#define SSE4_1_FUNC __attribute__ ((__target__(" sse4.1" )))
20
21
#define AVX2_FUNC __attribute__ ((__target__(" avx2" )))
21
22
#define AVX3_FUNC __attribute__ ((__target__(" avx512f,avx512bw,avx512dq,avx512cd,avx512vl" )))
23
+ #define AVX512_ICL_FUNC __attribute__ ((__target__(" avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq" )))
22
24
#endif // _MSC_VER
23
25
24
- #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
26
+
27
+ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && defined(__AVX512BITALG__) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && defined(__AVX512VBMI2__) && defined(__AVX512VNNI__) && defined(__AVX512VPOPCNTDQ__)
25
28
[[maybe_unused]] constexpr bool s_use_ssse3 = true ;
26
29
[[maybe_unused]] constexpr bool s_use_sse4_1 = true ;
27
30
[[maybe_unused]] constexpr bool s_use_avx2 = true ;
28
31
[[maybe_unused]] constexpr bool s_use_avx3 = true ;
32
+ [[maybe_unused]] constexpr bool s_use_avx512_icl = true ;
33
+ #elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
34
+ [[maybe_unused]] constexpr bool s_use_ssse3 = true ;
35
+ [[maybe_unused]] constexpr bool s_use_sse4_1 = true ;
36
+ [[maybe_unused]] constexpr bool s_use_avx2 = true ;
37
+ [[maybe_unused]] constexpr bool s_use_avx3 = true ;
38
+ [[maybe_unused]] constexpr bool s_use_avx512_icl = false ;
29
39
#elif defined(__AVX2__)
30
40
[[maybe_unused]] constexpr bool s_use_ssse3 = true ;
31
41
[[maybe_unused]] constexpr bool s_use_sse4_1 = true ;
32
42
[[maybe_unused]] constexpr bool s_use_avx2 = true ;
33
43
[[maybe_unused]] constexpr bool s_use_avx3 = false ;
44
+ [[maybe_unused]] constexpr bool s_use_avx512_icl = false ;
34
45
#elif defined(__SSE4_1__)
35
46
[[maybe_unused]] constexpr bool s_use_ssse3 = true ;
36
47
[[maybe_unused]] constexpr bool s_use_sse4_1 = true ;
37
48
[[maybe_unused]] constexpr bool s_use_avx2 = false ;
38
49
[[maybe_unused]] constexpr bool s_use_avx3 = false ;
50
+ [[maybe_unused]] constexpr bool s_use_avx512_icl = false ;
39
51
#elif defined(__SSSE3__)
40
52
[[maybe_unused]] constexpr bool s_use_ssse3 = true ;
41
53
[[maybe_unused]] constexpr bool s_use_sse4_1 = false ;
42
54
[[maybe_unused]] constexpr bool s_use_avx2 = false ;
43
55
[[maybe_unused]] constexpr bool s_use_avx3 = false ;
56
+ [[maybe_unused]] constexpr bool s_use_avx512_icl = false ;
44
57
#elif defined(ARCH_X64)
45
58
[[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3();
46
59
[[maybe_unused]] const bool s_use_sse4_1 = utils::has_sse41();
47
60
[[maybe_unused]] const bool s_use_avx2 = utils::has_avx2();
48
61
[[maybe_unused]] const bool s_use_avx3 = utils::has_avx512();
62
+ [[maybe_unused]] const bool s_use_avx512_icl = utils::has_avx512_icl();
49
63
#else
50
64
[[maybe_unused]] constexpr bool s_use_ssse3 = true ; // Non x86
51
65
[[maybe_unused]] constexpr bool s_use_sse4_1 = true ; // Non x86
52
66
[[maybe_unused]] constexpr bool s_use_avx2 = false ;
53
67
[[maybe_unused]] constexpr bool s_use_avx3 = false ;
68
+ [[maybe_unused]] constexpr bool s_use_avx512_icl = false ;
54
69
#endif
55
70
56
71
const v128 s_bswap_u32_mask = v128::from32(0x00010203 , 0x04050607 , 0x08090a0b , 0x0c0d0e0f );
@@ -404,6 +419,153 @@ namespace
404
419
}
405
420
};
406
421
422
+
423
+
424
+ #if defined(ARCH_X64)
425
+
426
+ SSE4_1_FUNC static inline u16 sse41_hmin_epu16 (__m128i x)
427
+ {
428
+ return _mm_cvtsi128_si32 (_mm_minpos_epu16 (x));
429
+ }
430
+
431
+ SSE4_1_FUNC static inline u16 sse41_hmax_epu16 (__m128i x)
432
+ {
433
+ return ~_mm_cvtsi128_si32 (_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1 ))));
434
+ }
435
+
436
+ AVX512_ICL_FUNC
437
+ static
438
+ std::tuple<u16, u16, u32> upload_u16_swapped_avx512_icl_skip_restart (const void *src, void *dst, u32 count, u16 restart_index)
439
+ {
440
+ const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2 (s_bswap_u16_mask);
441
+
442
+ auto src_stream = static_cast <const __m512*>(src);
443
+ auto dst_stream = static_cast <u16 *>(dst);
444
+
445
+ __m512i restart = _mm512_set1_epi16 (restart_index);
446
+ __m512i min = _mm512_set1_epi16 (-1 );
447
+ __m512i max = _mm512_set1_epi16 (0 );
448
+ const __m512i ones = _mm512_set1_epi16 (-1 );
449
+
450
+ int written = 0 ;
451
+
452
+ const auto iterations = count / 32 ;
453
+ for (u32 i = 0 ; i < iterations; i++)
454
+ {
455
+ const __m512i raw = _mm512_loadu_si512 (src_stream++);
456
+ const __m512i value = _mm512_shuffle_epi8 (raw, s_bswap_u16_mask512);
457
+ const __mmask32 mask = _mm512_cmpneq_epi16_mask (restart, value);
458
+ const __m512i value_with_max_restart = _mm512_mask_blend_epi16 (mask, ones, value);
459
+
460
+ max = _mm512_mask_max_epu16 (max, mask, max, value);
461
+ min = _mm512_mask_min_epu16 (min, mask, min, value);
462
+ const __m512i packed = _mm512_maskz_compress_epi16 (mask, value_with_max_restart);
463
+
464
+ int processed = _mm_popcnt_u32 (mask);
465
+ _mm512_storeu_si512 (dst_stream, packed);
466
+ dst_stream += processed;
467
+ written += processed;
468
+ }
469
+
470
+ u32 remainder = count % 32 ;
471
+ if (remainder > 0 )
472
+ {
473
+ __mmask32 rem_mask = (1U << remainder ) - 1 ;
474
+ __m512i raw = _mm512_maskz_loadu_epi16 (rem_mask, src_stream);
475
+ __m512i value = _mm512_shuffle_epi8 (raw, s_bswap_u16_mask512);
476
+ __mmask32 mask = _mm512_mask_cmpneq_epi16_mask (rem_mask, restart, value);
477
+
478
+ __m512i value_with_max_restart = _mm512_mask_blend_epi16 (mask, ones, value);
479
+ max = _mm512_mask_max_epu16 (max, mask, max, value);
480
+ min = _mm512_mask_min_epu16 (min, mask, min, value);
481
+ __m512i packed = _mm512_maskz_compress_epi16 (mask, value_with_max_restart);
482
+
483
+ int processed = _mm_popcnt_u32 (mask);
484
+ __mmask32 store_mask = (1U << processed) - 1 ;
485
+ _mm512_mask_storeu_epi16 (dst_stream, store_mask, packed);
486
+ written += processed;
487
+ }
488
+
489
+ __m256i tmp256 = _mm512_extracti64x4_epi64 (min, 1 );
490
+ __m256i min2 = _mm512_castsi512_si256 (min);
491
+ min2 = _mm256_min_epu16 (min2, tmp256);
492
+ __m128i tmp = _mm256_extracti128_si256 (min2, 1 );
493
+ __m128i min3 = _mm256_castsi256_si128 (min2);
494
+ min3 = _mm_min_epu16 (min3, tmp);
495
+
496
+ tmp256 = _mm512_extracti64x4_epi64 (max, 1 );
497
+ __m256i max2 = _mm512_castsi512_si256 (max);
498
+ max2 = _mm256_max_epu16 (max2, tmp256);
499
+ tmp = _mm256_extracti128_si256 (max2, 1 );
500
+ __m128i max3 = _mm256_castsi256_si128 (max2);
501
+ max3 = _mm_max_epu16 (max3, tmp);
502
+
503
+ const u16 min_index = sse41_hmin_epu16 (min3);
504
+ const u16 max_index = sse41_hmax_epu16 (max3);
505
+
506
+ return std::make_tuple (min_index, max_index, written);
507
+ }
508
+
509
+ AVX3_FUNC
510
+ static
511
+ std::tuple<u32, u32, u32> upload_u32_swapped_avx3_skip_restart (const void *src, void *dst, u32 count, u32 restart_index)
512
+ {
513
+ const __m512i s_bswap_u32_mask512 = _mm512_broadcast_i32x4 (s_bswap_u32_mask);
514
+
515
+ auto src_stream = static_cast <const __m512i*>(src);
516
+ auto dst_stream = static_cast <u32 *>(dst);
517
+
518
+ __m512i restart = _mm512_set1_epi32 (restart_index);
519
+ __m512i min = _mm512_set1_epi32 (-1 );
520
+ __m512i max = _mm512_set1_epi32 (0 );
521
+ const __m512i ones = _mm512_set1_epi32 (-1 );
522
+
523
+ int written = 0 ;
524
+
525
+ const u32 iterations = count / 16 ;
526
+ for (u32 i = 0 ; i < iterations; i++)
527
+ {
528
+ const __m512i raw = _mm512_loadu_si512 (src_stream++);
529
+ const __m512i value = _mm512_shuffle_epi8 (raw, s_bswap_u32_mask512);
530
+ const __mmask16 mask = _mm512_cmpneq_epi32_mask (restart, value);
531
+ const __m512i value_with_max_restart = _mm512_mask_blend_epi32 (mask, ones, value);
532
+
533
+ max = _mm512_mask_max_epu32 (max, mask, max, value);
534
+ min = _mm512_mask_min_epu32 (min, mask, min, value);
535
+ const __m512i packed = _mm512_maskz_compress_epi32 (mask, value_with_max_restart);
536
+
537
+ int processed = _mm_popcnt_u32 (mask);
538
+ _mm512_storeu_si512 (dst_stream, packed);
539
+ dst_stream += processed;
540
+ written += processed;
541
+ }
542
+
543
+ u32 remainder = count % 16 ;
544
+ if (remainder > 0 )
545
+ {
546
+ __mmask16 rem_mask = (1U << remainder ) - 1 ;
547
+ __m512i raw = _mm512_maskz_loadu_epi32 (rem_mask, src_stream);
548
+ __m512i value = _mm512_shuffle_epi8 (raw, s_bswap_u32_mask512);
549
+
550
+ __mmask16 mask = _mm512_mask_cmpneq_epi32_mask (rem_mask, restart, value);
551
+ __m512i value_with_max_restart = _mm512_mask_blend_epi32 (mask, ones, value);
552
+ max = _mm512_mask_max_epu32 (max, mask, max, value);
553
+ min = _mm512_mask_min_epu32 (min, mask, min, value);
554
+ __m512i packed = _mm512_maskz_compress_epi32 (mask, value_with_max_restart);
555
+
556
+ int processed = _mm_popcnt_u32 (mask);
557
+ __mmask16 store_mask = (1U << processed) - 1 ;
558
+ _mm512_mask_storeu_epi32 (dst_stream, store_mask, packed);
559
+ written += processed;
560
+ }
561
+
562
+ u32 min_index = _mm512_reduce_min_epu32 (min);
563
+ u32 max_index = _mm512_reduce_max_epu32 (max);
564
+
565
+ return std::make_tuple (min_index, max_index, written);
566
+ }
567
+ #endif
568
+
407
569
template <typename T>
408
570
NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart (std::span<to_be_t <const T>> src, std::span<T> dst, T restart_index)
409
571
{
@@ -412,6 +574,24 @@ NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be
412
574
u32 written = 0 ;
413
575
u32 length = ::size32 (src);
414
576
577
+ if constexpr (std::is_same_v<T, u16>)
578
+ {
579
+ if (s_use_avx512_icl)
580
+ {
581
+ std::tie (min_index, max_index, written) = upload_u16_swapped_avx512_icl_skip_restart (src.data (), dst.data (), length, restart_index);
582
+ return std::make_tuple (min_index, max_index, written);
583
+ }
584
+ }
585
+
586
+ if constexpr (std::is_same_v<T, u32>)
587
+ {
588
+ if (s_use_avx3)
589
+ {
590
+ std::tie (min_index, max_index, written) = upload_u32_swapped_avx3_skip_restart (src.data (), dst.data (), length, restart_index);
591
+ return std::make_tuple (min_index, max_index, written);
592
+ }
593
+ }
594
+
415
595
for (u32 i = written; i < length; ++i)
416
596
{
417
597
T index = src[i];
0 commit comments