-
Notifications
You must be signed in to change notification settings - Fork 13.3k
DAG: Avoid forming shufflevector from a single extract_vector_elt #122672
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Avoid forming shufflevector from a single extract_vector_elt #122672
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) Changes: This avoids regressions in a future AMDGPU commit. Previously we would have a build_vector (extract_vector_elt x), undef with free access to the elements bloated into a shuffle of one element + undef, which has much worse combine support than the extract. Alternatively could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different than isExtractVecEltCheap. Full diff: https://github.com/llvm/llvm-project/pull/122672.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 712e52ee8fc921..49ad0b526602eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23799,6 +23799,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SmallVector<SDValue, 8> VecIn;
VecIn.push_back(SDValue());
+ // If we have a single extract_element with a constant index, track the index
+ // value.
+ unsigned OneConstExtractIndex = ~0u;
+
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
@@ -23816,16 +23820,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
- if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(Op.getOperand(1)))
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
- SDValue ExtractedFromVec = Op.getOperand(0);
+ SDValue ExtractedFromVec = Op.getOperand(0);
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
+ auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!ExtractIdx)
+ return SDValue();
- const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
- if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+ if (ExtractIdx->getAsAPIntVal().uge(
+ ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
// All inputs must have the same element type as the output.
@@ -23833,6 +23839,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
+ OneConstExtractIndex = ExtractIdx->getZExtValue();
+
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
@@ -23855,6 +23863,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
+ // If we only found a single constant indexed extract_vector_elt feeding the
+ // build_vector, do not produce a more complicated shuffle if the extract is
+ // cheap.
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+ TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ return SDValue();
+
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 7912d1cf8dc0d1..add8c0f75bf335 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d09..6dac18c927e114 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -2124,14 +2124,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_1:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
@@ -2152,14 +2152,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_2:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
|
8760ea5
to
b9ec561
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think the heuristic here is quite what you want. I believe this heuristic disables both of the following cases:
- BuildVector w/one non-zero non-undef element
- BuildVector w/one non-zero non-undef source, repeated 100 times (i.e. splat or select of two splats)
Disabling the former seems defensible, doing so for the second less so.
Though honestly, I'm not sure of this change as a whole. Having a single canonical form seems valuable here. If the target isn't optimally lowering the splat or select of splat case in the shuffle lowering, maybe we should just adjust the target lowering to do so?
Depends if isExtractVecEltCheap is true or not for the non-zero index.
I don't follow, this is a 2 element vector, how can you have 100 variants?
It's not a lowering issue, it's the effect on every other combine. We'd have to special case 1 element + 1 undef shuffles everywhere we handle extract_vector_elt now, which is just excessive complexity. #122671 is almost an alternative in one instance, but still shows expanding complexity of handling this edge case. |
b93cb29
to
ea6a8ce
Compare
b9ec561
to
acdbd89
Compare
Isn't the condition in the code in terms of VecIn.size() == 2? I believe that VecIn is the set of unique input vectors, right? Which is distinct from the number of elements in the destination type? (Am I just misreading? I only skimmed this.)
Honestly, #122671 (from the review description only) sounds like a worthwhile change. That's not a hugely compelling argument here. Let's settle the prior point, and then return to this. If I'm just misreading something, let's not waste time discussing this. |
VecIn is collecting only extract_vector_elts feeding the build_vector. So it's true it's not only a 2 element vector, in general (but the standard case of building a complete vector is 2 elements). The other skipped elements are all constant or undef. A 2 element shuffle just happens to be the only case I care about which I'm trying to make legal (and really only the odd -> even case is of any use). |
This is exactly the distinction I'm trying to get at. Avoiding the creation of a 1-2 element shuffle seems quite reasonable. Avoiding the creation of a 100 element splat shuffle does not. I think you need to add an explicit condition in terms of the number of elements in the result, not the number of unique elements in the result. |
ea6a8ce
to
78a65e5
Compare
acdbd89
to
cc2d6ff
Compare
78a65e5
to
7facaf4
Compare
cc2d6ff
to
013b1aa
Compare
This is enough to solve my immediate issue to restrict it to a single extract use in the build_vector. However it does seem to be much more conservative than necessary. This new heuristic gives up the x86 improvements. I see many lit improvements with various more aggressive variants, but regressions in others. I think this would require more time working on x86, I think some of the regressions are due to missing optimizations there |
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' ca955197047ce044dec1e85fd401b1788550602d 047b467cce793fb89c43eb5f90db09d42b192fe3 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef` in ways that trigger undefined behavior; if you need an operand with some unimportant value, add a new argument to the function and use that instead. For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
} Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
} Please refer to the Undefined Behavior Manual for more information. |
7facaf4
to
da5089e
Compare
013b1aa
to
bb1a074
Compare
da5089e
to
93c5e27
Compare
bb1a074
to
3a4b222
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Merge activity
|
93c5e27
to
4170ae5
Compare
3a4b222
to
047b467
Compare
This avoids regressions in a future AMDGPU commit. Previously we would have a build_vector (extract_vector_elt x), undef with free access to the elements bloated into a shuffle of one element + undef, which has much worse combine support than the extract. Alternatively could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different than isExtractVecEltCheap.
047b467
to
0f588c8
Compare
This avoids regressions in a future AMDGPU commit. Previously we
would have a build_vector (extract_vector_elt x), undef with free
access to the elements bloated into a shuffle of one element + undef,
which has much worse combine support than the extract.
Alternatively could check aggressivelyPreferBuildVectorSources, but
I'm not sure it's really different than isExtractVecEltCheap.