@@ -816,6 +816,42 @@ struct VECTOR_SHL_V128
       }
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
+      if (e.IsFeatureEnabled(kX64EmitGFNI | kX64EmitAVX512Ortho |
+                             kX64EmitAVX512VBMI)) {
+        // gf2p8mulb's "x^8 + x^4 + x^3 + x + 1" polynomial reduction only
+        // applies when the multiplication overflows. Masking away any bits
+        // that would have overflowed turns the polynomial multiplication
+        // into a regular modular multiplication.
+        const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
+        e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
+        e.vpermb(e.xmm0, i.src2, e.xmm0);
+        e.vpand(e.xmm0, i.src1, e.xmm0);
+
+        // n << 0 == n * 1 | n << 1 == n * 2 | n << 2 == n * 4 | etc
+        const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
+        e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
+        e.vpermb(e.xmm1, i.src2, e.xmm1);
+
+        e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
+        return;
+      } else if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        // Only use the lower 4 bits.
+        // This also protects against vpshufb writing zero when the MSB is set.
+        e.LoadConstantXmm(e.xmm0, vec128b(0x0F));
+        e.vpand(e.xmm2, i.src2, e.xmm0);
+
+        const uint64_t shift_mask = UINT64_C(0x01'03'07'0f'1f'3f'7f'ff);
+        e.LoadConstantXmm(e.xmm0, vec128q(shift_mask, shift_mask));
+        e.vpshufb(e.xmm0, e.xmm0, e.xmm2);
+        e.vpand(e.xmm0, i.src1, e.xmm0);
+
+        const uint64_t multiply_table = UINT64_C(0x80'40'20'10'08'04'02'01);
+        e.LoadConstantXmm(e.xmm1, vec128q(multiply_table, multiply_table));
+        e.vpshufb(e.xmm1, e.xmm1, e.xmm2);
+
+        e.vgf2p8mulb(i.dest, e.xmm0, e.xmm1);
+        return;
+      }
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
     }
     e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
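Both new paths lean on the same observation: gf2p8mulb carry-lessly multiplies each pair of bytes and only applies its x^8 + x^4 + x^3 + x + 1 reduction when the intermediate product overflows eight bits. Masking the value with 0xFF >> shift beforehand keeps the product inside a byte, so a carry-less multiply by 1 << shift is exactly a byte-wise left shift. The scalar sketch below is illustrative only and not part of the patch; gf2p8mulb_scalar is a hand-written model of the per-byte instruction, used here just to check the mask/multiplier tables exhaustively.

#include <cstdint>
#include <cstdio>

// Scalar model of what GF2P8MULB does to one byte lane: carry-less multiply,
// then reduce the up-to-15-bit product modulo x^8 + x^4 + x^3 + x + 1 (0x11B).
static uint8_t gf2p8mulb_scalar(uint8_t a, uint8_t b) {
  uint16_t product = 0;
  for (int bit = 0; bit < 8; ++bit) {
    if (b & (1 << bit)) {
      product ^= static_cast<uint16_t>(a) << bit;  // carry-less add is XOR
    }
  }
  for (int bit = 14; bit >= 8; --bit) {
    if (product & (1 << bit)) {
      product ^= 0x11B << (bit - 8);  // reduction, only hit on overflow
    }
  }
  return static_cast<uint8_t>(product);
}

int main() {
  for (int value = 0; value < 256; ++value) {
    for (int shift = 0; shift < 8; ++shift) {
      // Per-lane table entries from the patch: shift_mask byte = 0xFF >> shift,
      // multiply_table byte = 1 << shift.
      uint8_t mask = static_cast<uint8_t>(0xFF >> shift);
      uint8_t multiplier = static_cast<uint8_t>(1 << shift);
      uint8_t via_gf =
          gf2p8mulb_scalar(static_cast<uint8_t>(value) & mask, multiplier);
      uint8_t expected = static_cast<uint8_t>(value << shift);
      if (via_gf != expected) {
        std::printf("mismatch: 0x%02x << %d\n", value, shift);
        return 1;
      }
    }
  }
  std::printf("all 256 x 8 byte shifts match\n");
  return 0;
}

The AVX-512 VBMI path can hand the raw shift bytes straight to vpermb, which only looks at the low index bits, while the plain GFNI path first masks them to 4 bits because vpshufb zeroes its output byte whenever the index MSB is set.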