Skip to content

Commit 1002bef

Browse files
committed
x86_64: rewrite integer @reduce(.Mul)
1 parent 6098ba5 commit 1002bef

File tree

6 files changed

+3830
-141
lines changed

6 files changed

+3830
-141
lines changed

src/arch/x86_64/CodeGen.zig

Lines changed: 3766 additions & 119 deletions
Large diffs are not rendered by default.

src/arch/x86_64/Encoding.zig

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ pub const Mnemonic = enum {
336336
fcom, fcomi, fcomip, fcomp, fcompp, fcos,
337337
fdecstp, fdiv, fdivp, fdivr, fdivrp, ffree,
338338
fiadd, ficom, ficomp, fidiv, fidivr, fild, fimul, fincstp, finit,
339-
fist, fistp, fisttp, fisub, fisubr,
339+
fist, fistp, fisub, fisubr,
340340
fld, fld1, fldcw, fldenv, fldl2e, fldl2t, fldlg2, fldln2, fldpi, fldz,
341341
fmul, fmulp,
342342
fnclex, fninit, fnop, fnsave, fnstcw, fnstenv, fnstsw,
@@ -349,19 +349,18 @@ pub const Mnemonic = enum {
349349
// MMX
350350
emms, movd, movq,
351351
packssdw, packsswb, packuswb,
352-
paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw,
352+
paddb, paddd, paddsb, paddsw, paddusb, paddusw, paddw,
353353
pand, pandn, por, pxor,
354354
pcmpeqb, pcmpeqd, pcmpeqw,
355355
pcmpgtb, pcmpgtd, pcmpgtw,
356-
pmulhw, pmullw,
356+
pmaddwd, pmulhw, pmullw,
357357
pslld, psllq, psllw,
358358
psrad, psraw,
359359
psrld, psrlq, psrlw,
360-
psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw,
360+
psubb, psubd, psubsb, psubsw, psubusb, psubusw, psubw,
361361
// SSE
362362
addps, addss,
363-
andps,
364-
andnps,
363+
andnps, andps,
365364
cmpps, cmpss, comiss,
366365
cvtpi2ps, cvtps2pi, cvtsi2ss, cvtss2si, cvttps2pi, cvttss2si,
367366
divps, divss,
@@ -374,9 +373,11 @@ pub const Mnemonic = enum {
374373
movss, movups,
375374
mulps, mulss,
376375
orps,
376+
pavgb, pavgw,
377377
pextrw, pinsrw,
378-
pmaxsw, pmaxub, pminsw, pminub, pmovmskb,
378+
pmaxsw, pmaxub, pminsw, pminub, pmovmskb, pmulhuw,
379379
prefetchit0, prefetchit1, prefetchnta, prefetcht0, prefetcht1, prefetcht2, prefetchw, prefetchwt1,
380+
psadbw, pshufw,
380381
shufps,
381382
sqrtps, sqrtss,
382383
stmxcsr,
@@ -397,15 +398,16 @@ pub const Mnemonic = enum {
397398
maxpd, maxsd,
398399
minpd, minsd,
399400
movapd,
400-
movdqa, movdqu,
401+
movdq2q, movdqa, movdqu,
401402
movhpd, movlpd,
402-
movmskpd,
403+
movmskpd, movq2dq,
403404
//movsd,
404405
movupd,
405406
mulpd, mulsd,
406407
orpd,
408+
paddq, pmuludq,
407409
pshufd, pshufhw, pshuflw,
408-
pslldq, psrldq,
410+
pslldq, psrldq, psubq,
409411
punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
410412
punpcklbw, punpckldq, punpcklqdq, punpcklwd,
411413
shufpd,
@@ -414,9 +416,17 @@ pub const Mnemonic = enum {
414416
ucomisd, unpckhpd, unpcklpd,
415417
xorpd,
416418
// SSE3
417-
addsubpd, addsubps, haddpd, haddps, lddqu, movddup, movshdup, movsldup,
419+
addsubpd, addsubps,
420+
fisttp,
421+
haddpd, haddps,
422+
hsubpd, hsubps,
423+
lddqu,
424+
movddup, movshdup, movsldup,
418425
// SSSE3
419-
pabsb, pabsd, pabsw, palignr, pshufb,
426+
pabsb, pabsd, pabsw, palignr,
427+
phaddw, phaddsw, phaddd, phsubw, phsubsw, phsubd,
428+
pmaddubsw, pmulhrsw, pshufb,
429+
psignb, psignd, psignw,
420430
// SSE4.1
421431
blendpd, blendps, blendvpd, blendvps,
422432
dppd, dpps,
@@ -430,7 +440,7 @@ pub const Mnemonic = enum {
430440
pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw,
431441
pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
432442
pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq,
433-
pmulld,
443+
pmuldq, pmulld,
434444
ptest,
435445
roundpd, roundps, roundsd, roundss,
436446
// SSE4.2
@@ -458,7 +468,7 @@ pub const Mnemonic = enum {
458468
vdppd, vdpps,
459469
vextractf128, vextractps,
460470
vgf2p8affineinvqb, vgf2p8affineqb, vgf2p8mulb,
461-
vhaddpd, vhaddps,
471+
vhaddpd, vhaddps, vhsubpd, vhsubps,
462472
vinsertf128, vinsertps,
463473
vlddqu, vldmxcsr,
464474
vmaskmovpd, vmaskmovps,
@@ -480,21 +490,24 @@ pub const Mnemonic = enum {
480490
vpabsb, vpabsd, vpabsw,
481491
vpackssdw, vpacksswb, vpackusdw, vpackuswb,
482492
vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
483-
vpalignr, vpand, vpandn,
493+
vpalignr, vpand, vpandn, vpavgb, vpavgw,
484494
vpblendvb, vpblendw, vpclmulqdq,
485495
vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
486496
vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
497+
vphaddw, vphaddsw, vphaddd, vphsubw, vphsubsw, vphsubd,
487498
vperm2f128, vpermilpd, vpermilps,
488499
vpextrb, vpextrd, vpextrq, vpextrw,
489500
vpinsrb, vpinsrd, vpinsrq, vpinsrw,
490501
vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
491502
vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw,
503+
vpmaddubsw,
492504
vpmovmskb,
493505
vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq,
494506
vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq,
495-
vpmulhw, vpmulld, vpmullw,
507+
vpmuldq, vpmulhrsw, vpmulhw, vpmulld, vpmullw, vpmuludq,
496508
vpor,
497509
vpshufb, vpshufd, vpshufhw, vpshuflw,
510+
vpsignb, vpsignd, vpsignw,
498511
vpslld, vpslldq, vpsllq, vpsllw,
499512
vpsrad, vpsraq, vpsraw,
500513
vpsrld, vpsrldq, vpsrlq, vpsrlw,
@@ -779,7 +792,7 @@ pub const Op = enum {
779792
pub fn isImmediate(op: Op) bool {
780793
// zig fmt: off
781794
return switch (op) {
782-
.imm8, .imm16, .imm32, .imm64,
795+
.imm8, .imm16, .imm32, .imm64,
783796
.imm8s, .imm16s, .imm32s,
784797
.rel8, .rel16, .rel32,
785798
.unity,
@@ -986,6 +999,7 @@ pub const Feature = enum {
986999
sse,
9871000
sse2,
9881001
sse3,
1002+
@"sse3 x87",
9891003
sse4_1,
9901004
sse4_2,
9911005
ssse3,

src/arch/x86_64/Lower.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
567567
}
568568

569569
fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
570-
@setEvalBranchQuota(2_500);
570+
@setEvalBranchQuota(2_600);
571571
const fixes = switch (inst.ops) {
572572
.none => inst.data.none.fixes,
573573
.inst => inst.data.inst.fixes,

src/arch/x86_64/Mir.zig

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,10 @@ pub const Inst = struct {
386386
/// Packed ___ Quadword
387387
p_q,
388388
/// Packed ___ Double Quadword
389+
/// Packed ___ Doubleword to Quadword
389390
p_dq,
391+
/// Packed ___ Unsigned Doubleword to Quadword
392+
p_udq,
390393
/// ___ Aligned Packed Integer Values
391394
_dqa,
392395
/// ___ Unaligned Packed Integer Values
@@ -446,7 +449,10 @@ pub const Inst = struct {
446449
/// VEX-Encoded Packed ___ Quadword
447450
vp_q,
448451
/// VEX-Encoded Packed ___ Double Quadword
452+
/// VEX-Encoded Packed ___ Doubleword to Quadword
449453
vp_dq,
454+
/// VEX-Encoded Packed ___ Unsigned Doubleword to Quadword
455+
vp_udq,
450456
/// VEX-Encoded ___ Scalar Single-Precision Values
451457
v_ss,
452458
/// VEX-Encoded ___ Packed Single-Precision Values
@@ -663,6 +669,8 @@ pub const Inst = struct {
663669
/// Multiply scalar single-precision floating-point values
664670
/// Multiply packed double-precision floating-point values
665671
/// Multiply scalar double-precision floating-point values
672+
/// Multiply packed unsigned doubleword integers
673+
/// Multiply packed doubleword integers
666674
mul,
667675
/// Two's complement negation
668676
neg,

src/arch/x86_64/encodings.zon

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,10 +1160,6 @@
11601160
.{ .fistp, .m, .{ .m32 }, .{ 0xdb }, 3, .none, .x87 },
11611161
.{ .fistp, .m, .{ .m64 }, .{ 0xdf }, 7, .none, .x87 },
11621162

1163-
.{ .fisttp, .m, .{ .m16 }, .{ 0xdf }, 1, .none, .x87 },
1164-
.{ .fisttp, .m, .{ .m32 }, .{ 0xdb }, 1, .none, .x87 },
1165-
.{ .fisttp, .m, .{ .m64 }, .{ 0xdd }, 1, .none, .x87 },
1166-
11671163
.{ .fld, .m, .{ .m32 }, .{ 0xd9 }, 0, .none, .x87 },
11681164
.{ .fld, .m, .{ .m64 }, .{ 0xdd }, 0, .none, .x87 },
11691165
.{ .fld, .m, .{ .m80 }, .{ 0xdb }, 5, .none, .x87 },
@@ -1540,6 +1536,8 @@
15401536

15411537
.{ .pmullw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 },
15421538

1539+
.{ .pmuludq, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf4 }, 0, .none, .sse2 },
1540+
15431541
.{ .por, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 },
15441542

15451543
.{ .pshufd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .none, .sse2 },
@@ -1618,6 +1616,10 @@
16181616

16191617
.{ .addsubps, .rm, .{ .xmm, .xmm_m128 }, .{ 0xf2, 0x0f, 0xd0 }, 0, .none, .sse3 },
16201618

1619+
.{ .fisttp, .m, .{ .m16 }, .{ 0xdf }, 1, .none, .@"sse3 x87" },
1620+
.{ .fisttp, .m, .{ .m32 }, .{ 0xdb }, 1, .none, .@"sse3 x87" },
1621+
.{ .fisttp, .m, .{ .m64 }, .{ 0xdd }, 1, .none, .@"sse3 x87" },
1622+
16211623
.{ .haddpd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x7c }, 0, .none, .sse3 },
16221624

16231625
.{ .haddps, .rm, .{ .xmm, .xmm_m128 }, .{ 0xf2, 0x0f, 0x7c }, 0, .none, .sse3 },
@@ -1708,6 +1710,8 @@
17081710
.{ .pmovzxwq, .rm, .{ .xmm, .xmm_m32 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .none, .sse4_1 },
17091711
.{ .pmovzxdq, .rm, .{ .xmm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .none, .sse4_1 },
17101712

1713+
.{ .pmuldq, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .none, .sse4_1 },
1714+
17111715
.{ .pmulld, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 },
17121716

17131717
.{ .ptest, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x17 }, 0, .none, .sse4_1 },
@@ -2166,12 +2170,16 @@
21662170
.{ .vpmovzxwq, .rm, .{ .xmm, .xmm_m32 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_128_wig, .avx },
21672171
.{ .vpmovzxdq, .rm, .{ .xmm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_128_wig, .avx },
21682172

2173+
.{ .vpmuldq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_128_wig, .avx },
2174+
21692175
.{ .vpmulhw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx },
21702176

21712177
.{ .vpmulld, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
21722178

21732179
.{ .vpmullw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .vex_128_wig, .avx },
21742180

2181+
.{ .vpmuludq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf4 }, 0, .vex_128_wig, .avx },
2182+
21752183
.{ .vpor, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },
21762184

21772185
.{ .vpshufb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx },
@@ -2493,12 +2501,16 @@
24932501
.{ .vpmovzxwq, .rm, .{ .ymm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_256_wig, .avx2 },
24942502
.{ .vpmovzxdq, .rm, .{ .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_256_wig, .avx2 },
24952503

2504+
.{ .vpmuldq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_256_wig, .avx2 },
2505+
24962506
.{ .vpmulhw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 },
24972507

24982508
.{ .vpmulld, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
24992509

25002510
.{ .vpmullw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xd5 }, 0, .vex_256_wig, .avx2 },
25012511

2512+
.{ .vpmuludq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf4 }, 0, .vex_256_wig, .avx2 },
2513+
25022514
.{ .vpor, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },
25032515

25042516
.{ .vpshufb, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 },

test/behavior/x86_64/unary.zig

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4889,6 +4889,14 @@ test reduceAdd {
48894889
try test_reduce_add.testIntVectors();
48904890
}
48914891

4892+
inline fn reduceMul(comptime Type: type, rhs: Type) @typeInfo(Type).vector.child {
4893+
return @reduce(.Mul, rhs);
4894+
}
4895+
test reduceMul {
4896+
const test_reduce_mul = unary(reduceMul, .{});
4897+
try test_reduce_mul.testIntVectors();
4898+
}
4899+
48924900
inline fn splat(comptime Type: type, rhs: Type) Type {
48934901
return @splat(rhs[0]);
48944902
}

0 commit comments

Comments
 (0)