-
Notifications
You must be signed in to change notification settings - Fork 13.3k
DAG: Avoid forming shufflevector from a single extract_vector_elt #122672
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Avoid forming shufflevector from a single extract_vector_elt #122672
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) Changes: This avoids regressions in a future AMDGPU commit. Previously we would have a build_vector (extract_vector_elt x), undef with free access to the elements bloated into a shuffle of one element + undef, which has much worse combine support than the extract. Alternatively could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different than isExtractVecEltCheap. Full diff: https://github.com/llvm/llvm-project/pull/122672.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 712e52ee8fc921..49ad0b526602eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23799,6 +23799,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SmallVector<SDValue, 8> VecIn;
VecIn.push_back(SDValue());
+ // If we have a single extract_element with a constant index, track the index
+ // value.
+ unsigned OneConstExtractIndex = ~0u;
+
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
@@ -23816,16 +23820,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
- if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(Op.getOperand(1)))
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
- SDValue ExtractedFromVec = Op.getOperand(0);
+ SDValue ExtractedFromVec = Op.getOperand(0);
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
+ auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!ExtractIdx)
+ return SDValue();
- const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
- if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+ if (ExtractIdx->getAsAPIntVal().uge(
+ ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
// All inputs must have the same element type as the output.
@@ -23833,6 +23839,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
+ OneConstExtractIndex = ExtractIdx->getZExtValue();
+
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
@@ -23855,6 +23863,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
+ // If we only found a single constant indexed extract_vector_elt feeding the
+ // build_vector, do not produce a more complicated shuffle if the extract is
+ // cheap.
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+ TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ return SDValue();
+
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 7912d1cf8dc0d1..add8c0f75bf335 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d09..6dac18c927e114 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -2124,14 +2124,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_1:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
+; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
@@ -2152,14 +2152,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; AVX1-LABEL: build_vector_to_shuffle_2:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
+; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d]
; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%vecext = extractelement <4 x float> %A, i32 1
|
8760ea5
to
b9ec561
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think the heuristic here is quite what you want. I believe this heuristic disables both of the following cases:
- BuildVector w/one non-zero non-undef element
- BuildVector w/one non-zero non-undef source, repeated 100 times (i.e. splat or select of two splats)
Disabling the former seems defensible, doing so for the second less so.
Though honestly, I'm not sure of this change as a whole. Having a single canonical form seems valuable here. If the target isn't optimally lowering the splat or select of splat case in the shuffle lowering, maybe we should just adjust the target lowering to do so?
Depends if isExtractVecEltCheap is true or not for the non-zero index.
I don't follow, this is a 2 element vector, how can you have 100 variants?
It's not a lowering issue, it's the effect on every other combine. We'd have to special case 1 element + 1 undef shuffles everywhere we handle extract_vector_elt now, which is just excessive complexity. #122671 is almost an alternative in one instance, but still shows expanding complexity of handling this edge case. |
b93cb29
to
ea6a8ce
Compare
b9ec561
to
acdbd89
Compare
Isn't the condition in the code in terms of VecIn.size() == 2? I believe that VecIn is the set of unique input vectors, right? Which is distinct from the number of elements in the destination type? (Am I just misreading? I only skimmed this.)
Honestly, #122671 (from the review description only) sounds like a worthwhile change. That's not a hugely compelling argument here. Let's settle the prior point, and then return to this. If I'm just misreading something, let's not waste time discussing this. |
VecIn is collecting only extract_vector_elts feeding the build_vector. So it's true it's not only a 2 element vector, in general (but the standard case of building a complete vector is 2 elements). The other skipped elements are all constant or undef. A 2 element shuffle just happens to be the only case I care about which I'm trying to make legal (and really only the odd -> even case is of any use). |
This is exactly the distinction I'm trying to get at. Avoiding the creation of a 1-2 element shuffle seems quite reasonable. Avoiding the creation of a 100 element splat shuffle does not. I think you need to add an explicit condition in terms of the number of elements in the result, not the number of unique elements in the result. |
ea6a8ce
to
78a65e5
Compare
acdbd89
to
cc2d6ff
Compare
78a65e5
to
7facaf4
Compare
cc2d6ff
to
013b1aa
Compare
This is enough to solve my immediate issue to restrict it to a single extract use in the build_vector. However it does seem to be much more conservative than necessary. This new heuristic gives up the x86 improvements. I see many lit improvements with various more aggressive variants, but regressions in others. I think this would require more time working on x86, I think some of the regressions are due to missing optimizations there |
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' ca955197047ce044dec1e85fd401b1788550602d 047b467cce793fb89c43eb5f90db09d42b192fe3 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef` in ways that trigger undefined behavior; if you need an operand with some unimportant value, add a new argument to the function and use that instead. For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
} Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
} Please refer to the Undefined Behavior Manual for more information. |
7facaf4
to
da5089e
Compare
013b1aa
to
bb1a074
Compare
da5089e
to
93c5e27
Compare
bb1a074
to
3a4b222
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Merge activity
|
93c5e27
to
4170ae5
Compare
3a4b222
to
047b467
Compare
This avoids regressions in a future AMDGPU commit. Previously we would have a build_vector (extract_vector_elt x), undef with free access to the elements bloated into a shuffle of one element + undef, which has much worse combine support than the extract. Alternatively could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different than isExtractVecEltCheap.
047b467
to
0f588c8
Compare
This avoids regressions in a future AMDGPU commit. Previously we
would have a build_vector (extract_vector_elt x), undef with free
access to the elements bloated into a shuffle of one element + undef,
which has much worse combine support than the extract.
Alternatively could check aggressivelyPreferBuildVectorSources, but
I'm not sure it's really different than isExtractVecEltCheap.