Skip to content

Commit 935ef0a

Browse files
tangaac and var-const
authored and committed
[LoongArch] Lower vector shuffle as byte rotate (if possible) (llvm#135157)
1 parent c6ce452 commit 935ef0a

File tree

3 files changed

+203
-82
lines changed

3 files changed

+203
-82
lines changed

Diff for: llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

+137
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,139 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
714714
}
715715
}
716716

717+
/// Test whether a shuffle mask is equivalent within each sub-lane.
718+
///
719+
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
720+
/// non-trivial to compute in the face of undef lanes. The representation is
721+
/// suitable for use with existing 128-bit shuffles as entries from the second
722+
/// vector have been remapped to [LaneSize, 2*LaneSize).
723+
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
724+
ArrayRef<int> Mask,
725+
SmallVectorImpl<int> &RepeatedMask) {
726+
auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
727+
RepeatedMask.assign(LaneSize, -1);
728+
int Size = Mask.size();
729+
for (int i = 0; i < Size; ++i) {
730+
assert(Mask[i] == -1 || Mask[i] >= 0);
731+
if (Mask[i] < 0)
732+
continue;
733+
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
734+
// This entry crosses lanes, so there is no way to model this shuffle.
735+
return false;
736+
737+
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
738+
// Adjust second vector indices to start at LaneSize instead of Size.
739+
int LocalM =
740+
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
741+
if (RepeatedMask[i % LaneSize] < 0)
742+
// This is the first non-undef entry in this slot of a 128-bit lane.
743+
RepeatedMask[i % LaneSize] = LocalM;
744+
else if (RepeatedMask[i % LaneSize] != LocalM)
745+
// Found a mismatch with the repeated mask.
746+
return false;
747+
}
748+
return true;
749+
}
750+
751+
/// Attempts to match vector shuffle as byte rotation.
752+
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
753+
ArrayRef<int> Mask) {
754+
755+
SDValue Lo, Hi;
756+
SmallVector<int, 16> RepeatedMask;
757+
758+
if (!isRepeatedShuffleMask(128, VT, Mask, RepeatedMask))
759+
return -1;
760+
761+
int NumElts = RepeatedMask.size();
762+
int Rotation = 0;
763+
int Scale = 16 / NumElts;
764+
765+
for (int i = 0; i < NumElts; ++i) {
766+
int M = RepeatedMask[i];
767+
assert((M == -1 || (0 <= M && M < (2 * NumElts))) &&
768+
"Unexpected mask index.");
769+
if (M < 0)
770+
continue;
771+
772+
// Determine where a rotated vector would have started.
773+
int StartIdx = i - (M % NumElts);
774+
if (StartIdx == 0)
775+
return -1;
776+
777+
// If we found the tail of a vector the rotation must be the missing
778+
// front. If we found the head of a vector, it must be how much of the
779+
// head.
780+
int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
781+
782+
if (Rotation == 0)
783+
Rotation = CandidateRotation;
784+
else if (Rotation != CandidateRotation)
785+
return -1;
786+
787+
// Compute which value this mask is pointing at.
788+
SDValue MaskV = M < NumElts ? V1 : V2;
789+
790+
// Compute which of the two target values this index should be assigned
791+
// to. This reflects whether the high elements are remaining or the low
792+
// elements are remaining.
793+
SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
794+
795+
// Either set up this value if we've not encountered it before, or check
796+
// that it remains consistent.
797+
if (!TargetV)
798+
TargetV = MaskV;
799+
else if (TargetV != MaskV)
800+
return -1;
801+
}
802+
803+
// Check that we successfully analyzed the mask, and normalize the results.
804+
assert(Rotation != 0 && "Failed to locate a viable rotation!");
805+
assert((Lo || Hi) && "Failed to find a rotated input vector!");
806+
if (!Lo)
807+
Lo = Hi;
808+
else if (!Hi)
809+
Hi = Lo;
810+
811+
V1 = Lo;
812+
V2 = Hi;
813+
814+
return Rotation * Scale;
815+
}
816+
817+
/// Lower VECTOR_SHUFFLE as byte rotate (if possible).
818+
///
819+
/// For example:
820+
/// %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b,
821+
/// <2 x i32> <i32 3, i32 0>
822+
/// is lowered to:
823+
/// (VBSRL_V $v1, $v1, 8)
824+
/// (VBSLL_V $v0, $v0, 8)
825+
/// (VOR_V $v0, $V0, $v1)
826+
static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL,
827+
ArrayRef<int> Mask, MVT VT,
828+
SDValue V1, SDValue V2,
829+
SelectionDAG &DAG) {
830+
831+
SDValue Lo = V1, Hi = V2;
832+
int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
833+
if (ByteRotation <= 0)
834+
return SDValue();
835+
836+
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
837+
Lo = DAG.getBitcast(ByteVT, Lo);
838+
Hi = DAG.getBitcast(ByteVT, Hi);
839+
840+
int LoByteShift = 16 - ByteRotation;
841+
int HiByteShift = ByteRotation;
842+
843+
SDValue LoShift = DAG.getNode(LoongArchISD::VBSLL, DL, ByteVT, Lo,
844+
DAG.getConstant(LoByteShift, DL, MVT::i64));
845+
SDValue HiShift = DAG.getNode(LoongArchISD::VBSRL, DL, ByteVT, Hi,
846+
DAG.getConstant(HiByteShift, DL, MVT::i64));
847+
return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LoShift, HiShift));
848+
}
849+
717850
/// Lower VECTOR_SHUFFLE as ZERO_EXTEND Or ANY_EXTEND (if possible).
718851
///
719852
/// For example:
@@ -1230,6 +1363,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
12301363
if ((Result =
12311364
lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable)))
12321365
return Result;
1366+
if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG)))
1367+
return Result;
12331368
if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
12341369
return Result;
12351370
return SDValue();
@@ -1666,6 +1801,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16661801
if ((Result =
16671802
lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, Zeroable)))
16681803
return Result;
1804+
if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG)))
1805+
return Result;
16691806
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))
16701807
return Result;
16711808

Diff for: llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll

+33-41
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
define <32 x i8> @byte_rotate_v32_i8_1(<32 x i8> %a, <32 x i8> %b) {
77
; CHECK-LABEL: byte_rotate_v32_i8_1:
88
; CHECK: # %bb.0:
9-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
10-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0)
11-
; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2
9+
; CHECK-NEXT: xvbsrl.v $xr0, $xr0, 1
10+
; CHECK-NEXT: xvbsll.v $xr1, $xr1, 15
11+
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
1212
; CHECK-NEXT: ret
1313
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
1414
ret <32 x i8> %shuffle
@@ -17,9 +17,9 @@ define <32 x i8> @byte_rotate_v32_i8_1(<32 x i8> %a, <32 x i8> %b) {
1717
define <32 x i8> @byte_rotate_v32_i8_2(<32 x i8> %a, <32 x i8> %b) {
1818
; CHECK-LABEL: byte_rotate_v32_i8_2:
1919
; CHECK: # %bb.0:
20-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
21-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI1_0)
22-
; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2
20+
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 13
21+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 3
22+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
2323
; CHECK-NEXT: ret
2424
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 45, i32 46, i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 61, i32 62, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
2525
ret <32 x i8> %shuffle
@@ -28,9 +28,9 @@ define <32 x i8> @byte_rotate_v32_i8_2(<32 x i8> %a, <32 x i8> %b) {
2828
define <32 x i8> @byte_rotate_v32_i8_3(<32 x i8> %a) {
2929
; CHECK-LABEL: byte_rotate_v32_i8_3:
3030
; CHECK: # %bb.0:
31-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
32-
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0)
33-
; CHECK-NEXT: xvshuf.b $xr0, $xr0, $xr0, $xr1
31+
; CHECK-NEXT: xvbsrl.v $xr1, $xr0, 1
32+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 15
33+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
3434
; CHECK-NEXT: ret
3535
%shuffle = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16>
3636
ret <32 x i8> %shuffle
@@ -40,10 +40,9 @@ define <32 x i8> @byte_rotate_v32_i8_3(<32 x i8> %a) {
4040
define <16 x i16> @byte_rotate_v16i16_1(<16 x i16> %a, <16 x i16> %b) nounwind {
4141
; CHECK-LABEL: byte_rotate_v16i16_1:
4242
; CHECK: # %bb.0:
43-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
44-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0)
45-
; CHECK-NEXT: xvshuf.h $xr2, $xr1, $xr0
46-
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
43+
; CHECK-NEXT: xvbsrl.v $xr0, $xr0, 6
44+
; CHECK-NEXT: xvbsll.v $xr1, $xr1, 10
45+
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
4746
; CHECK-NEXT: ret
4847
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26>
4948
ret <16 x i16> %shuffle
@@ -52,10 +51,9 @@ define <16 x i16> @byte_rotate_v16i16_1(<16 x i16> %a, <16 x i16> %b) nounwind {
5251
define <16 x i16> @byte_rotate_v16i16_2(<16 x i16> %a, <16 x i16> %b) nounwind {
5352
; CHECK-LABEL: byte_rotate_v16i16_2:
5453
; CHECK: # %bb.0:
55-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
56-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI4_0)
57-
; CHECK-NEXT: xvshuf.h $xr2, $xr1, $xr0
58-
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
54+
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 10
55+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 6
56+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
5957
; CHECK-NEXT: ret
6058
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 21, i32 22, i32 23, i32 0, i32 1, i32 2,i32 3, i32 4, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12>
6159
ret <16 x i16> %shuffle
@@ -64,10 +62,9 @@ define <16 x i16> @byte_rotate_v16i16_2(<16 x i16> %a, <16 x i16> %b) nounwind {
6462
define <16 x i16> @byte_rotate_v16i16_3(<16 x i16> %a) nounwind {
6563
; CHECK-LABEL: byte_rotate_v16i16_3:
6664
; CHECK: # %bb.0:
67-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
68-
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI5_0)
69-
; CHECK-NEXT: xvshuf.h $xr1, $xr0, $xr0
70-
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
65+
; CHECK-NEXT: xvbsrl.v $xr1, $xr0, 6
66+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 10
67+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
7168
; CHECK-NEXT: ret
7269
%shuffle = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10>
7370
ret <16 x i16> %shuffle
@@ -76,10 +73,9 @@ define <16 x i16> @byte_rotate_v16i16_3(<16 x i16> %a) nounwind {
7673
define <8 x i32> @byte_rotate_v8i32_1(<8 x i32> %a, <8 x i32> %b) nounwind {
7774
; CHECK-LABEL: byte_rotate_v8i32_1:
7875
; CHECK: # %bb.0:
79-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
80-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI6_0)
81-
; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0
82-
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
76+
; CHECK-NEXT: xvbsrl.v $xr0, $xr0, 4
77+
; CHECK-NEXT: xvbsll.v $xr1, $xr1, 12
78+
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
8379
; CHECK-NEXT: ret
8480
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
8581
ret <8 x i32> %shuffle
@@ -88,10 +84,9 @@ define <8 x i32> @byte_rotate_v8i32_1(<8 x i32> %a, <8 x i32> %b) nounwind {
8884
define <8 x i32> @byte_rotate_v8i32_2(<8 x i32> %a, <8 x i32> %b) nounwind {
8985
; CHECK-LABEL: byte_rotate_v8i32_2:
9086
; CHECK: # %bb.0:
91-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
92-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI7_0)
93-
; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0
94-
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
87+
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 12
88+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 4
89+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
9590
; CHECK-NEXT: ret
9691
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6>
9792
ret <8 x i32> %shuffle
@@ -109,10 +104,9 @@ define <8 x i32> @byte_rotate_v8i32_3(<8 x i32> %a) nounwind {
109104
define <4 x i64> @byte_rotate_v4i64_1(<4 x i64> %a, <4 x i64> %b) nounwind {
110105
; CHECK-LABEL: byte_rotate_v4i64_1:
111106
; CHECK: # %bb.0:
112-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0)
113-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI9_0)
114-
; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0
115-
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
107+
; CHECK-NEXT: xvbsrl.v $xr0, $xr0, 8
108+
; CHECK-NEXT: xvbsll.v $xr1, $xr1, 8
109+
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
116110
; CHECK-NEXT: ret
117111
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
118112
ret <4 x i64> %shuffle
@@ -121,10 +115,9 @@ define <4 x i64> @byte_rotate_v4i64_1(<4 x i64> %a, <4 x i64> %b) nounwind {
121115
define <4 x i64> @byte_rotate_v4i64_2(<4 x i64> %a, <4 x i64> %b) nounwind {
122116
; CHECK-LABEL: byte_rotate_v4i64_2:
123117
; CHECK: # %bb.0:
124-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
125-
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI10_0)
126-
; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0
127-
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
118+
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
119+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 8
120+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
128121
; CHECK-NEXT: ret
129122
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 0, i32 7, i32 2>
130123
ret <4 x i64> %shuffle
@@ -133,10 +126,9 @@ define <4 x i64> @byte_rotate_v4i64_2(<4 x i64> %a, <4 x i64> %b) nounwind {
133126
define <4 x i64> @byte_rotate_v4i64_3(<4 x i64> %a) nounwind {
134127
; CHECK-LABEL: byte_rotate_v4i64_3:
135128
; CHECK: # %bb.0:
136-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
137-
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI11_0)
138-
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
139-
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
129+
; CHECK-NEXT: xvbsrl.v $xr1, $xr0, 8
130+
; CHECK-NEXT: xvbsll.v $xr0, $xr0, 8
131+
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
140132
; CHECK-NEXT: ret
141133
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
142134
ret <4 x i64> %shuffle

0 commit comments

Comments
 (0)