Skip to content

Commit 36fa27f

Browse files
authored
[AArch64][GlobalISel] Add patterns for scalar sqdmlal/sqdmlsl (llvm#187246)
SQDMLAL's instruction selection patterns don't work for GlobalISel when the intrinsic has scalar operands, because the scalar multiply intrinsic has a slightly different name (int_aarch64_neon_sqdmulls_scalar). This leads to sub-optimal code generation. This patch allows sqdmulls_scalar to lower, and adds GlobalISel versions of the TableGen patterns to provide this optimisation. The patterns added perform this mapping: `SQADD(a, SQDMULL(b, c)) -> SQDMLAL(a, b, c)`, and the equivalent for subtraction: `SQSUB(a, SQDMULL(b, c)) -> SQDMLSL(a, b, c)`.
1 parent b914982 commit 36fa27f

File tree

4 files changed

+133
-42
lines changed

4 files changed

+133
-42
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6621,14 +6621,26 @@ defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
66216621
defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
66226622
defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
66236623

6624+
def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
6625+
(i32 FPR32:$Rm)),
6626+
(SQDMULLi32 FPR32:$Rn, FPR32:$Rm)>;
6627+
66246628
def : Pat<(f64 (AArch64sqadd FPR64:$Rd,
66256629
(AArch64sqdmull FPR32:$Rn, FPR32:$Rm))),
66266630
(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
66276631

6632+
def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
6633+
(int_aarch64_neon_sqdmulls_scalar FPR32:$Rn, FPR32:$Rm))),
6634+
(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
6635+
66286636
def : Pat<(f64 (AArch64sqsub FPR64:$Rd,
66296637
(AArch64sqdmull FPR32:$Rn, FPR32:$Rm))),
66306638
(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
66316639

6640+
def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
6641+
(int_aarch64_neon_sqdmulls_scalar FPR32:$Rn, FPR32:$Rm))),
6642+
(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
6643+
66326644
//===----------------------------------------------------------------------===//
66336645
// Advanced SIMD two scalar instructions.
66346646
//===----------------------------------------------------------------------===//
@@ -9042,6 +9054,18 @@ def : Pat<(f64 (AArch64sqdmull FPR32:$Rn,
90429054
VectorIndexS:$idx)))))),
90439055
(SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
90449056

9057+
def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
9058+
(int_aarch64_neon_sqdmulls_scalar FPR32:$Rn,
9059+
(vector_extract (v4i32 V128:$Vm),
9060+
VectorIndexS:$idx)))),
9061+
(SQDMLALv1i64_indexed FPR64:$Rd, FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
9062+
9063+
def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
9064+
(int_aarch64_neon_sqdmulls_scalar FPR32:$Rn,
9065+
(vector_extract (v4i32 V128:$Vm),
9066+
VectorIndexS:$idx)))),
9067+
(SQDMLSLv1i64_indexed FPR64:$Rd, FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
9068+
90459069
//----------------------------------------------------------------------------
90469070
// AdvSIMD scalar shift instructions
90479071
//----------------------------------------------------------------------------

llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
625625
case Intrinsic::aarch64_neon_sqadd:
626626
case Intrinsic::aarch64_neon_uqsub:
627627
case Intrinsic::aarch64_neon_sqsub:
628+
case Intrinsic::aarch64_neon_sqdmulls_scalar:
628629
case Intrinsic::aarch64_neon_srshl:
629630
case Intrinsic::aarch64_neon_urshl:
630631
case Intrinsic::aarch64_neon_sqshl:

llvm/test/CodeGen/AArch64/arm64-int-neon.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
33
; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -global-isel-abort=2 -mattr=+fprcvt,+fullfp16 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

5-
; CHECK-GI: warning: Instruction selection used fallback path for test_sqdmulls_scalar
6-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqdmulh_scalar
5+
; CHECK-GI: warning: Instruction selection used fallback path for test_sqdmulh_scalar
76
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqabs_s32
87
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqabs_s64
98
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqneg_s32

llvm/test/CodeGen/AArch64/arm64-vmul.ll

Lines changed: 107 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,6 @@
1919
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
2020
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
2121
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s
22-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vqdmulls_lane_s32
23-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d
24-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d
2522
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
2623
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
2724
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
@@ -30,8 +27,6 @@
3027
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
3128
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
3229
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
33-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d
34-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d
3530

3631
define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
3732
; CHECK-LABEL: smull8h:
@@ -1794,52 +1789,106 @@ define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
17941789
}
17951790

17961791
define i64 @test_vqdmulls_lane_s32(i32 noundef %a, <2 x i32> noundef %b) {
1797-
; CHECK-LABEL: test_vqdmulls_lane_s32:
1798-
; CHECK: // %bb.0: // %entry
1799-
; CHECK-NEXT: fmov s1, w0
1800-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1801-
; CHECK-NEXT: sqdmull d0, s1, v0.s[1]
1802-
; CHECK-NEXT: fmov x0, d0
1803-
; CHECK-NEXT: ret
1792+
; CHECK-SD-LABEL: test_vqdmulls_lane_s32:
1793+
; CHECK-SD: // %bb.0: // %entry
1794+
; CHECK-SD-NEXT: fmov s1, w0
1795+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1796+
; CHECK-SD-NEXT: sqdmull d0, s1, v0.s[1]
1797+
; CHECK-SD-NEXT: fmov x0, d0
1798+
; CHECK-SD-NEXT: ret
1799+
;
1800+
; CHECK-GI-LABEL: test_vqdmulls_lane_s32:
1801+
; CHECK-GI: // %bb.0: // %entry
1802+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1803+
; CHECK-GI-NEXT: fmov s1, w0
1804+
; CHECK-GI-NEXT: mov s0, v0.s[1]
1805+
; CHECK-GI-NEXT: sqdmull d0, s1, s0
1806+
; CHECK-GI-NEXT: fmov x0, d0
1807+
; CHECK-GI-NEXT: ret
18041808
entry:
18051809
%vget_lane = extractelement <2 x i32> %b, i64 1
18061810
%vqdmulls_s32.i = tail call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %vget_lane)
18071811
ret i64 %vqdmulls_s32.i
18081812
}
18091813

1810-
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1811-
; CHECK-LABEL: sqdmlal_lane_1d:
1814+
define i64 @sqdmlal_lane_1d_v2i32(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1815+
; CHECK-SD-LABEL: sqdmlal_lane_1d_v2i32:
1816+
; CHECK-SD: // %bb.0:
1817+
; CHECK-SD-NEXT: fmov s1, w1
1818+
; CHECK-SD-NEXT: fmov d2, x0
1819+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1820+
; CHECK-SD-NEXT: sqdmlal d2, s1, v0.s[1]
1821+
; CHECK-SD-NEXT: fmov x0, d2
1822+
; CHECK-SD-NEXT: ret
1823+
;
1824+
; CHECK-GI-LABEL: sqdmlal_lane_1d_v2i32:
1825+
; CHECK-GI: // %bb.0:
1826+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1827+
; CHECK-GI-NEXT: fmov s1, w1
1828+
; CHECK-GI-NEXT: fmov d2, x0
1829+
; CHECK-GI-NEXT: mov s0, v0.s[1]
1830+
; CHECK-GI-NEXT: sqdmlal d2, s1, s0
1831+
; CHECK-GI-NEXT: fmov x0, d2
1832+
; CHECK-GI-NEXT: ret
1833+
%rhs = extractelement <2 x i32> %C, i32 1
1834+
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1835+
%res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
1836+
ret i64 %res
1837+
}
1838+
1839+
define i64 @sqdmlsl_lane_1d_v2i32(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1840+
; CHECK-SD-LABEL: sqdmlsl_lane_1d_v2i32:
1841+
; CHECK-SD: // %bb.0:
1842+
; CHECK-SD-NEXT: fmov s1, w1
1843+
; CHECK-SD-NEXT: fmov d2, x0
1844+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1845+
; CHECK-SD-NEXT: sqdmlsl d2, s1, v0.s[1]
1846+
; CHECK-SD-NEXT: fmov x0, d2
1847+
; CHECK-SD-NEXT: ret
1848+
;
1849+
; CHECK-GI-LABEL: sqdmlsl_lane_1d_v2i32:
1850+
; CHECK-GI: // %bb.0:
1851+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
1852+
; CHECK-GI-NEXT: fmov s1, w1
1853+
; CHECK-GI-NEXT: fmov d2, x0
1854+
; CHECK-GI-NEXT: mov s0, v0.s[1]
1855+
; CHECK-GI-NEXT: sqdmlsl d2, s1, s0
1856+
; CHECK-GI-NEXT: fmov x0, d2
1857+
; CHECK-GI-NEXT: ret
1858+
%rhs = extractelement <2 x i32> %C, i32 1
1859+
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
1860+
%res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
1861+
ret i64 %res
1862+
}
1863+
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1864+
1865+
define i64 @sqdmlal_lane_1d_v4i32(i64 %A, i32 %B, <4 x i32> %C) nounwind {
1866+
; CHECK-LABEL: sqdmlal_lane_1d_v4i32:
18121867
; CHECK: // %bb.0:
18131868
; CHECK-NEXT: fmov s1, w1
18141869
; CHECK-NEXT: fmov d2, x0
1815-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
18161870
; CHECK-NEXT: sqdmlal d2, s1, v0.s[1]
18171871
; CHECK-NEXT: fmov x0, d2
18181872
; CHECK-NEXT: ret
1819-
%rhs = extractelement <2 x i32> %C, i32 1
1873+
%rhs = extractelement <4 x i32> %C, i32 1
18201874
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
18211875
%res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
18221876
ret i64 %res
18231877
}
1824-
declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1825-
declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
18261878

1827-
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
1828-
; CHECK-LABEL: sqdmlsl_lane_1d:
1879+
define i64 @sqdmlsl_lane_1d_v4i32(i64 %A, i32 %B, <4 x i32> %C) nounwind {
1880+
; CHECK-LABEL: sqdmlsl_lane_1d_v4i32:
18291881
; CHECK: // %bb.0:
18301882
; CHECK-NEXT: fmov s1, w1
18311883
; CHECK-NEXT: fmov d2, x0
1832-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
18331884
; CHECK-NEXT: sqdmlsl d2, s1, v0.s[1]
18341885
; CHECK-NEXT: fmov x0, d2
18351886
; CHECK-NEXT: ret
1836-
%rhs = extractelement <2 x i32> %C, i32 1
1887+
%rhs = extractelement <4 x i32> %C, i32 1
18371888
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
18381889
%res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
18391890
ret i64 %res
18401891
}
1841-
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1842-
18431892

18441893
define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
18451894
; CHECK-LABEL: umlal_lane_4s:
@@ -3216,14 +3265,23 @@ define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
32163265
}
32173266

32183267
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
3219-
; CHECK-LABEL: sqdmlal_d:
3220-
; CHECK: // %bb.0:
3221-
; CHECK-NEXT: fmov s0, w1
3222-
; CHECK-NEXT: fmov s1, w0
3223-
; CHECK-NEXT: fmov d2, x2
3224-
; CHECK-NEXT: sqdmlal d2, s1, s0
3225-
; CHECK-NEXT: fmov x0, d2
3226-
; CHECK-NEXT: ret
3268+
; CHECK-SD-LABEL: sqdmlal_d:
3269+
; CHECK-SD: // %bb.0:
3270+
; CHECK-SD-NEXT: fmov s0, w1
3271+
; CHECK-SD-NEXT: fmov s1, w0
3272+
; CHECK-SD-NEXT: fmov d2, x2
3273+
; CHECK-SD-NEXT: sqdmlal d2, s1, s0
3274+
; CHECK-SD-NEXT: fmov x0, d2
3275+
; CHECK-SD-NEXT: ret
3276+
;
3277+
; CHECK-GI-LABEL: sqdmlal_d:
3278+
; CHECK-GI: // %bb.0:
3279+
; CHECK-GI-NEXT: fmov s0, w0
3280+
; CHECK-GI-NEXT: fmov s1, w1
3281+
; CHECK-GI-NEXT: fmov d2, x2
3282+
; CHECK-GI-NEXT: sqdmlal d2, s0, s1
3283+
; CHECK-GI-NEXT: fmov x0, d2
3284+
; CHECK-GI-NEXT: ret
32273285
%tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
32283286
%tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
32293287
ret i64 %tmp5
@@ -3256,14 +3314,23 @@ define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
32563314
}
32573315

32583316
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
3259-
; CHECK-LABEL: sqdmlsl_d:
3260-
; CHECK: // %bb.0:
3261-
; CHECK-NEXT: fmov s0, w1
3262-
; CHECK-NEXT: fmov s1, w0
3263-
; CHECK-NEXT: fmov d2, x2
3264-
; CHECK-NEXT: sqdmlsl d2, s1, s0
3265-
; CHECK-NEXT: fmov x0, d2
3266-
; CHECK-NEXT: ret
3317+
; CHECK-SD-LABEL: sqdmlsl_d:
3318+
; CHECK-SD: // %bb.0:
3319+
; CHECK-SD-NEXT: fmov s0, w1
3320+
; CHECK-SD-NEXT: fmov s1, w0
3321+
; CHECK-SD-NEXT: fmov d2, x2
3322+
; CHECK-SD-NEXT: sqdmlsl d2, s1, s0
3323+
; CHECK-SD-NEXT: fmov x0, d2
3324+
; CHECK-SD-NEXT: ret
3325+
;
3326+
; CHECK-GI-LABEL: sqdmlsl_d:
3327+
; CHECK-GI: // %bb.0:
3328+
; CHECK-GI-NEXT: fmov s0, w0
3329+
; CHECK-GI-NEXT: fmov s1, w1
3330+
; CHECK-GI-NEXT: fmov d2, x2
3331+
; CHECK-GI-NEXT: sqdmlsl d2, s0, s1
3332+
; CHECK-GI-NEXT: fmov x0, d2
3333+
; CHECK-GI-NEXT: ret
32673334
%tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
32683335
%tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
32693336
ret i64 %tmp5

0 commit comments

Comments
 (0)