Skip to content

Commit 47357b3

Browse files
committed
[x64] Add GFNI and AVX512VBMI optimization for VECTOR_SHL(int8)
Uses `{v}gf2p8mulb` as a general int8-multiplication instruction to simulate bit-shifts.
1 parent 887fda5 commit 47357b3

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

src/xenia/cpu/backend/x64/x64_seq_vector.cc

+36
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,42 @@ struct VECTOR_SHL_V128
816816
}
817817
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
818818
} else {
819+
if (e.IsFeatureEnabled(kX64EmitGFNI | kX64EmitAVX512Ortho |
820+
kX64EmitAVX512VBMI)) {
821+
// gf2p8mulb's "x^8 + x^4 + x^3 + x + 1"-polynomial-reduction only
822+
// applies when the multiplication overflows. Masking away any bits
823+
// that would have overflowed turns the polynomial-multiplication into
824+
// regular modulo-multiplication
825+
const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
826+
e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
827+
e.vpermb(e.xmm0, i.src2, e.xmm0);
828+
e.vpand(e.xmm0, i.src1, e.xmm0);
829+
830+
// n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc
831+
const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
832+
e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
833+
e.vpermb(e.xmm1, i.src2, e.xmm1);
834+
835+
e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
836+
return;
837+
} else if (e.IsFeatureEnabled(kX64EmitGFNI)) {
838+
// Only use the lower 4 bits
839+
// This also protects against vpshufb writing zero when the MSB is set
840+
e.LoadConstantXmm(e.xmm0, vec128b(0x0F));
841+
e.vpand(e.xmm2, i.src2, e.xmm0);
842+
843+
const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
844+
e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
845+
e.vpshufb(e.xmm0, e.xmm0, e.xmm2);
846+
e.vpand(e.xmm0, i.src1, e.xmm0);
847+
848+
const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
849+
e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
850+
e.vpshufb(e.xmm1, e.xmm1, e.xmm2);
851+
852+
e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
853+
return;
854+
}
819855
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
820856
}
821857
e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));

0 commit comments

Comments
 (0)