[AArch64] Avoid selecting XAR for reverse operations. #178706
Conversation
Rotations that implement reverse operations, for example:
```c
uint64x2_t foo(uint64x2_t r) {
return (r >> 32) | (r << 32);
}
```
are currently lowered to XAR (when available):
```gas
foo:
movi v1.2d, #0000000000000000
xar v0.2d, v0.2d, v1.2d, #32
ret
```
This is suboptimal, as REV* instructions typically have higher throughput
than XAR and do not require the zero operand.
This patch avoids selecting XAR when the operation can be represented by
Neon or SVE REV* instructions. It also adds a few patterns for selecting
the latter in cases where they were missing.
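With this patch, the example above instead lowers to a single REV64, as checked by the new rev64_v2i64 test:
```gas
foo:
        rev64   v0.4s, v0.4s
        ret
```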
https://godbolt.org/z/z9Y6xbr5W
@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

I can break the PR into separate ones if preferred.

Full diff: https://github.com/llvm/llvm-project/pull/178706.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7b566e432b6bc..b819b3d944a22 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4616,7 +4616,11 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
!ISD::isConstantSplatVector(N1.getOperand(2).getNode(), ShrAmt))
return false;
- if (ShlAmt + ShrAmt != VT.getScalarSizeInBits())
+ unsigned VTSizeInBits = VT.getScalarSizeInBits();
+ if (ShlAmt + ShrAmt != VTSizeInBits)
+ return false;
+ if ((VTSizeInBits == 32 || VTSizeInBits == 64) &&
+ VTSizeInBits == ShlAmt * 2)
return false;
if (!IsXOROperand) {
@@ -4700,6 +4704,9 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
unsigned VTSizeInBits = VT.getScalarSizeInBits();
if (ShAmt + HsAmt != VTSizeInBits)
return false;
+ if ((VTSizeInBits == 16 || VTSizeInBits == 32 || VTSizeInBits == 64) &&
+ VTSizeInBits == ShAmt * 2)
+ return false;
if (!IsXOROperand) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i64);
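For reference, both bail-outs amount to rejecting XAR when the rotation is by exactly half the element width, for the element sizes the REV instructions cover (32/64 bits in the first hunk, 16/32/64 bits in the second). A minimal standalone sketch of that condition, with a hypothetical helper name:
```cpp
#include <cstdint>

// Sketch only: mirrors the new checks above. A rotation by half the element
// width on 16-, 32- or 64-bit elements is a lane-wise reverse, so the caller
// should return false (i.e. not select XAR) and let the REV* patterns match.
static bool isRevLikeRotation(unsigned EltSizeInBits, uint64_t ShlAmt,
                              uint64_t ShrAmt) {
  if (ShlAmt + ShrAmt != EltSizeInBits)
    return false; // not a rotation of the element at all
  return (EltSizeInBits == 16 || EltSizeInBits == 32 ||
          EltSizeInBits == 64) &&
         EltSizeInBits == ShlAmt * 2;
}
```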
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 96d45d4d48652..2e6df16b136dd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6022,12 +6022,21 @@ def : Pat<(v2i64 (bswap (v2i64 V128:$Rn))),
def : Pat<(v2i64 (or (v2i64 (AArch64vshl (v2i64 V128:$Rn), (i32 32))),
(v2i64 (AArch64vlshr (v2i64 V128:$Rn), (i32 32))))),
(v2i64 (REV64v4i32 (v2i64 V128:$Rn)))>;
+def : Pat<(v1i64 (or (v1i64 (AArch64vshl (v1i64 V64:$Rn), (i32 32))),
+ (v1i64 (AArch64vlshr (v1i64 V64:$Rn), (i32 32))))),
+ (v1i64 (REV64v2i32 (v1i64 V64:$Rn)))>;
def : Pat<(v4i32 (or (v4i32 (AArch64vshl (v4i32 V128:$Rn), (i32 16))),
(v4i32 (AArch64vlshr (v4i32 V128:$Rn), (i32 16))))),
(v4i32 (REV32v8i16 (v4i32 V128:$Rn)))>;
def : Pat<(v2i32 (or (v2i32 (AArch64vshl (v2i32 V64:$Rn), (i32 16))),
(v2i32 (AArch64vlshr (v2i32 V64:$Rn), (i32 16))))),
(v2i32 (REV32v4i16 (v2i32 V64:$Rn)))>;
+def : Pat<(v8i16 (or (v8i16 (AArch64vshl (v8i16 V128:$Rn), (i32 8))),
+ (v8i16 (AArch64vlshr (v8i16 V128:$Rn), (i32 8))))),
+ (v8i16 (REV16v16i8 (v8i16 V128:$Rn)))>;
+def : Pat<(v4i16 (or (v4i16 (AArch64vshl (v4i16 V64:$Rn), (i32 8))),
+ (v4i16 (AArch64vlshr (v4i16 V64:$Rn), (i32 8))))),
+ (v4i16 (REV16v8i8 (v4i16 V64:$Rn)))>;
//===----------------------------------------------------------------------===//
// Advanced SIMD three vector instructions.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dd90bf2622ec3..f73d22ee57595 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1068,6 +1068,14 @@ let Predicates = [HasSVE_or_SME] in {
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", AArch64revh_mt>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", AArch64revw_mt>;
+ def : Pat<(nxv4i32 (or (nxv4i32 (AArch64lsl_p nxv4i1:$pg, nxv4i32:$op, (nxv4i32 (splat_vector (i32 16))))),
+ (nxv4i32 (AArch64lsr_p nxv4i1:$pg, nxv4i32:$op, (nxv4i32 (splat_vector (i32 16))))))),
+ (REVH_ZPmZ_S (IMPLICIT_DEF), $pg, $op)>;
+
+ def : Pat<(nxv2i64 (or (nxv2i64 (AArch64lsl_p nxv2i1:$pg, nxv2i64:$op, (nxv2i64 (splat_vector (i64 32))))),
+ (nxv2i64 (AArch64lsr_p nxv2i1:$pg, nxv2i64:$op, (nxv2i64 (splat_vector (i64 32))))))),
+ (REVW_ZPmZ_D (IMPLICIT_DEF), $pg, $op)>;
+
defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse, int_aarch64_sve_rev_b16, int_aarch64_sve_rev_b32, int_aarch64_sve_rev_b64>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>;
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 8f6f4510d8388..4989cc19600e9 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -296,6 +296,38 @@ define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale
ret <vscale x 2 x i64> %or
}
+; Don't use XAR if REV[BHW] can be used.
+
+define <vscale x 8 x i16> @revb_nxv8i16(<vscale x 8 x i16> %r) #1 {
+; CHECK-LABEL: revb_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 8 x i16> @llvm.fshl(<vscale x 8 x i16> %r, <vscale x 8 x i16> %r, <vscale x 8 x i16> splat (i16 8))
+ ret <vscale x 8 x i16> %or
+}
+
+define <vscale x 4 x i32> @revh_nxv4i32(<vscale x 4 x i32> %r) #1 {
+; CHECK-LABEL: revh_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: revh z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 4 x i32> @llvm.fshl(<vscale x 4 x i32> %r, <vscale x 4 x i32> %r, <vscale x 4 x i32> splat (i32 16))
+ ret <vscale x 4 x i32> %or
+}
+
+define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) #1 {
+; CHECK-LABEL: revw_nx2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: revw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 2 x i64> @llvm.fshl(<vscale x 2 x i64> %r, <vscale x 2 x i64> %r, <vscale x 2 x i64> splat (i64 32))
+ ret <vscale x 2 x i64> %or
+}
+
declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 652617b58eaf3..c2cca5c20bd2a 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=CHECK,SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=CHECK,NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefixes=CHECK,SVE2 %s
/* 128-bit vectors */
@@ -359,6 +359,62 @@ entry:
ret <8 x i8> %or
}
+; Don't use XAR if REV16/REV32/REV64 can be used.
+
+define <4 x i16> @rev16_v4i16(<4 x i16> %r) {
+; CHECK-LABEL: rev16_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: ret
+ %or = tail call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 8))
+ ret <4 x i16> %or
+}
+
+define <2 x i32> @rev32_v2i32(<2 x i32> %r) {
+; CHECK-LABEL: rev32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.4h, v0.4h
+; CHECK-NEXT: ret
+ %or = tail call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 16))
+ ret <2 x i32> %or
+}
+
+define <1 x i64> @rev64_v1i64(<1 x i64> %r) {
+; CHECK-LABEL: rev64_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.2s, v0.2s
+; CHECK-NEXT: ret
+ %or = tail call <1 x i64> @llvm.fshl(<1 x i64> %r, <1 x i64> %r, <1 x i64> splat (i64 32))
+ ret <1 x i64> %or
+}
+
+define <8 x i16> @rev16_v8i16(<8 x i16> %r) {
+; CHECK-LABEL: rev16_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: ret
+ %or = tail call <8 x i16> @llvm.fshl(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 8))
+ ret <8 x i16> %or
+}
+
+define <4 x i32> @rev32_v4i32(<4 x i32> %r) {
+; CHECK-LABEL: rev32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.8h, v0.8h
+; CHECK-NEXT: ret
+ %or = tail call <4 x i32> @llvm.fshl(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 16))
+ ret <4 x i32> %or
+}
+
+define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
+; CHECK-LABEL: rev64_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.4s, v0.4s
+; CHECK-NEXT: ret
+ %or = tail call <2 x i64> @llvm.fshl(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 32))
+ ret <2 x i64> %or
+}
+
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
def : Pat<(v8i16 (or (v8i16 (AArch64vshl (v8i16 V128:$Rn), (i32 8))),
                     (v8i16 (AArch64vlshr (v8i16 V128:$Rn), (i32 8))))),
          (v8i16 (REV16v16i8 (v8i16 V128:$Rn)))>;
def : Pat<(v4i16 (or (v4i16 (AArch64vshl (v4i16 V64:$Rn), (i32 8))),
                     (v4i16 (AArch64vlshr (v4i16 V64:$Rn), (i32 8))))),
          (v4i16 (REV16v8i8 (v4i16 V64:$Rn)))>;
Rather than adding these, I believe ISD::BSWAP could be made legal for v4i16 / v8i16, which would then allow 8-bit rotations on those types to be combined into bswap, as happens for SVE. Please let me know if that would be preferred.
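For context, rotating a 16-bit lane by 8 bits is exactly a per-lane byte swap, so such a combine would rewrite the fshl form used in the tests into a bswap node. A small illustrative IR sketch of the equivalence (not part of this patch; function names are made up):
```llvm
; rot8 and swap8 compute the same result: an 8-bit rotation of i16 lanes
; swaps the two bytes of each lane.
define <8 x i16> @rot8(<8 x i16> %r) {
  %or = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 8))
  ret <8 x i16> %or
}

define <8 x i16> @swap8(<8 x i16> %r) {
  %bs = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %r)
  ret <8 x i16> %bs
}

declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
```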