Skip to content

Commit a26e34b

Browse files
committed
BufferUtils: Optimize upload_untoucheed_skip_restart with AVX-512 paths
- u16 path needs AVX-512-ICL because vpcompressw isn't included in skylake-x level AVX-512 - the u32 path is untested as I couldn't find any games that hit it
1 parent faef63e commit a26e34b

File tree

1 file changed

+181
-1
lines changed

1 file changed

+181
-1
lines changed

rpcs3/Emu/RSX/Common/BufferUtils.cpp

+181-1
Original file line numberDiff line numberDiff line change
@@ -15,42 +15,57 @@
1515
#define SSE4_1_FUNC
1616
#define AVX2_FUNC
1717
#define AVX3_FUNC
18+
#define AVX512_ICL_FUNC
1819
#else
1920
#define SSE4_1_FUNC __attribute__((__target__("sse4.1")))
2021
#define AVX2_FUNC __attribute__((__target__("avx2")))
2122
#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl")))
23+
#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
2224
#endif // _MSC_VER
2325

24-
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
26+
27+
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) && defined(__AVX512BITALG__) && defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && defined(__AVX512VBMI2__) && defined(__AVX512VNNI__) && defined(__AVX512VPOPCNTDQ__)
2528
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
2629
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
2730
[[maybe_unused]] constexpr bool s_use_avx2 = true;
2831
[[maybe_unused]] constexpr bool s_use_avx3 = true;
32+
[[maybe_unused]] constexpr bool s_use_avx512_icl = true;
33+
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__)
34+
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
35+
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
36+
[[maybe_unused]] constexpr bool s_use_avx2 = true;
37+
[[maybe_unused]] constexpr bool s_use_avx3 = true;
38+
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
2939
#elif defined(__AVX2__)
3040
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
3141
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
3242
[[maybe_unused]] constexpr bool s_use_avx2 = true;
3343
[[maybe_unused]] constexpr bool s_use_avx3 = false;
44+
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
3445
#elif defined(__SSE4_1__)
3546
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
3647
[[maybe_unused]] constexpr bool s_use_sse4_1 = true;
3748
[[maybe_unused]] constexpr bool s_use_avx2 = false;
3849
[[maybe_unused]] constexpr bool s_use_avx3 = false;
50+
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
3951
#elif defined(__SSSE3__)
4052
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
4153
[[maybe_unused]] constexpr bool s_use_sse4_1 = false;
4254
[[maybe_unused]] constexpr bool s_use_avx2 = false;
4355
[[maybe_unused]] constexpr bool s_use_avx3 = false;
56+
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
4457
#elif defined(ARCH_X64)
4558
[[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3();
4659
[[maybe_unused]] const bool s_use_sse4_1 = utils::has_sse41();
4760
[[maybe_unused]] const bool s_use_avx2 = utils::has_avx2();
4861
[[maybe_unused]] const bool s_use_avx3 = utils::has_avx512();
62+
[[maybe_unused]] const bool s_use_avx512_icl = utils::has_avx512_icl();
4963
#else
5064
[[maybe_unused]] constexpr bool s_use_ssse3 = true; // Non x86
5165
[[maybe_unused]] constexpr bool s_use_sse4_1 = true; // Non x86
5266
[[maybe_unused]] constexpr bool s_use_avx2 = false;
5367
[[maybe_unused]] constexpr bool s_use_avx3 = false;
68+
[[maybe_unused]] constexpr bool s_use_avx512_icl = false;
5469
#endif
5570

5671
const v128 s_bswap_u32_mask = v128::from32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
@@ -404,6 +419,153 @@ namespace
404419
}
405420
};
406421

422+
423+
424+
#if defined(ARCH_X64)
425+
426+
SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x)
427+
{
428+
return _mm_cvtsi128_si32(_mm_minpos_epu16(x));
429+
}
430+
431+
SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x)
432+
{
433+
return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1))));
434+
}
435+
436+
AVX512_ICL_FUNC
437+
static
438+
std::tuple<u16, u16, u32> upload_u16_swapped_avx512_icl_skip_restart(const void *src, void *dst, u32 count, u16 restart_index)
439+
{
440+
const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);
441+
442+
auto src_stream = static_cast<const __m512*>(src);
443+
auto dst_stream = static_cast<u16 *>(dst);
444+
445+
__m512i restart = _mm512_set1_epi16(restart_index);
446+
__m512i min = _mm512_set1_epi16(-1);
447+
__m512i max = _mm512_set1_epi16(0);
448+
const __m512i ones = _mm512_set1_epi16(-1);
449+
450+
int written = 0;
451+
452+
const auto iterations = count / 32;
453+
for (u32 i = 0; i < iterations; i++)
454+
{
455+
const __m512i raw = _mm512_loadu_si512(src_stream++);
456+
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
457+
const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value);
458+
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
459+
460+
max = _mm512_mask_max_epu16(max, mask, max, value);
461+
min = _mm512_mask_min_epu16(min, mask, min, value);
462+
const __m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);
463+
464+
int processed = _mm_popcnt_u32(mask);
465+
_mm512_storeu_si512(dst_stream, packed);
466+
dst_stream += processed;
467+
written += processed;
468+
}
469+
470+
u32 remainder = count % 32;
471+
if (remainder > 0)
472+
{
473+
__mmask32 rem_mask = (1U << remainder) - 1;
474+
__m512i raw = _mm512_maskz_loadu_epi16(rem_mask, src_stream);
475+
__m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
476+
__mmask32 mask = _mm512_mask_cmpneq_epi16_mask(rem_mask, restart, value);
477+
478+
__m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
479+
max = _mm512_mask_max_epu16(max, mask, max, value);
480+
min = _mm512_mask_min_epu16(min, mask, min, value);
481+
__m512i packed = _mm512_maskz_compress_epi16(mask, value_with_max_restart);
482+
483+
int processed = _mm_popcnt_u32(mask);
484+
__mmask32 store_mask = (1U << processed) - 1;
485+
_mm512_mask_storeu_epi16(dst_stream, store_mask, packed);
486+
written += processed;
487+
}
488+
489+
__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
490+
__m256i min2 = _mm512_castsi512_si256(min);
491+
min2 = _mm256_min_epu16(min2, tmp256);
492+
__m128i tmp = _mm256_extracti128_si256(min2, 1);
493+
__m128i min3 = _mm256_castsi256_si128(min2);
494+
min3 = _mm_min_epu16(min3, tmp);
495+
496+
tmp256 = _mm512_extracti64x4_epi64(max, 1);
497+
__m256i max2 = _mm512_castsi512_si256(max);
498+
max2 = _mm256_max_epu16(max2, tmp256);
499+
tmp = _mm256_extracti128_si256(max2, 1);
500+
__m128i max3 = _mm256_castsi256_si128(max2);
501+
max3 = _mm_max_epu16(max3, tmp);
502+
503+
const u16 min_index = sse41_hmin_epu16(min3);
504+
const u16 max_index = sse41_hmax_epu16(max3);
505+
506+
return std::make_tuple(min_index, max_index, written);
507+
}
508+
509+
AVX3_FUNC
510+
static
511+
std::tuple<u32, u32, u32> upload_u32_swapped_avx3_skip_restart(const void *src, void *dst, u32 count, u32 restart_index)
512+
{
513+
const __m512i s_bswap_u32_mask512 = _mm512_broadcast_i32x4(s_bswap_u32_mask);
514+
515+
auto src_stream = static_cast<const __m512i*>(src);
516+
auto dst_stream = static_cast<u32 *>(dst);
517+
518+
__m512i restart = _mm512_set1_epi32(restart_index);
519+
__m512i min = _mm512_set1_epi32(-1);
520+
__m512i max = _mm512_set1_epi32(0);
521+
const __m512i ones = _mm512_set1_epi32(-1);
522+
523+
int written = 0;
524+
525+
const u32 iterations = count / 16;
526+
for (u32 i = 0; i < iterations; i++)
527+
{
528+
const __m512i raw = _mm512_loadu_si512(src_stream++);
529+
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
530+
const __mmask16 mask = _mm512_cmpneq_epi32_mask(restart, value);
531+
const __m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
532+
533+
max = _mm512_mask_max_epu32(max, mask, max, value);
534+
min = _mm512_mask_min_epu32(min, mask, min, value);
535+
const __m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);
536+
537+
int processed = _mm_popcnt_u32(mask);
538+
_mm512_storeu_si512(dst_stream, packed);
539+
dst_stream += processed;
540+
written += processed;
541+
}
542+
543+
u32 remainder = count % 16;
544+
if (remainder > 0)
545+
{
546+
__mmask16 rem_mask = (1U << remainder) - 1;
547+
__m512i raw = _mm512_maskz_loadu_epi32(rem_mask, src_stream);
548+
__m512i value = _mm512_shuffle_epi8(raw, s_bswap_u32_mask512);
549+
550+
__mmask16 mask = _mm512_mask_cmpneq_epi32_mask(rem_mask, restart, value);
551+
__m512i value_with_max_restart = _mm512_mask_blend_epi32(mask, ones, value);
552+
max = _mm512_mask_max_epu32(max, mask, max, value);
553+
min = _mm512_mask_min_epu32(min, mask, min, value);
554+
__m512i packed = _mm512_maskz_compress_epi32(mask, value_with_max_restart);
555+
556+
int processed = _mm_popcnt_u32(mask);
557+
__mmask16 store_mask = (1U << processed) - 1;
558+
_mm512_mask_storeu_epi32(dst_stream, store_mask, packed);
559+
written += processed;
560+
}
561+
562+
u32 min_index = _mm512_reduce_min_epu32(min);
563+
u32 max_index = _mm512_reduce_max_epu32(max);
564+
565+
return std::make_tuple(min_index, max_index, written);
566+
}
567+
#endif
568+
407569
template <typename T>
408570
NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be_t<const T>> src, std::span<T> dst, T restart_index)
409571
{
@@ -412,6 +574,24 @@ NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be
412574
u32 written = 0;
413575
u32 length = ::size32(src);
414576

577+
if constexpr (std::is_same_v<T, u16>)
578+
{
579+
if (s_use_avx512_icl)
580+
{
581+
std::tie(min_index, max_index, written) = upload_u16_swapped_avx512_icl_skip_restart(src.data(), dst.data(), length, restart_index);
582+
return std::make_tuple(min_index, max_index, written);
583+
}
584+
}
585+
586+
if constexpr (std::is_same_v<T, u32>)
587+
{
588+
if (s_use_avx3)
589+
{
590+
std::tie(min_index, max_index, written) = upload_u32_swapped_avx3_skip_restart(src.data(), dst.data(), length, restart_index);
591+
return std::make_tuple(min_index, max_index, written);
592+
}
593+
}
594+
415595
for (u32 i = written; i < length; ++i)
416596
{
417597
T index = src[i];

0 commit comments

Comments
 (0)