Skip to content

Commit 7475f0a

Browse files
authored
DAG: Avoid forming shufflevector from a single extract_vector_elt (#122672)
This avoids regressions in a future AMDGPU commit. Previously we would have a build_vector (extract_vector_elt x), undef with free access to the elements bloated into a shuffle of one element + undef, which has much worse combine support than the extract. Alternatively could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different than isExtractVecEltCheap.
1 parent afc43a7 commit 7475f0a

File tree

2 files changed

+36
-10
lines changed

2 files changed

+36
-10
lines changed

Diff for: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+31-5
Original file line numberDiff line numberDiff line change
@@ -23807,6 +23807,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
2380723807
SmallVector<SDValue, 8> VecIn;
2380823808
VecIn.push_back(SDValue());
2380923809

23810+
// If we have a single extract_element with a constant index, track the index
23811+
// value.
23812+
unsigned OneConstExtractIndex = ~0u;
23813+
23814+
// Count the number of extract_vector_elt sources (i.e. non-constant or undef)
23815+
unsigned NumExtracts = 0;
23816+
2381023817
for (unsigned i = 0; i != NumElems; ++i) {
2381123818
SDValue Op = N->getOperand(i);
2381223819

@@ -23824,23 +23831,28 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
2382423831

2382523832
// Not an undef or zero. If the input is something other than an
2382623833
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
23827-
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23828-
!isa<ConstantSDNode>(Op.getOperand(1)))
23834+
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
2382923835
return SDValue();
23830-
SDValue ExtractedFromVec = Op.getOperand(0);
2383123836

23837+
SDValue ExtractedFromVec = Op.getOperand(0);
2383223838
if (ExtractedFromVec.getValueType().isScalableVector())
2383323839
return SDValue();
23840+
auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
23841+
if (!ExtractIdx)
23842+
return SDValue();
2383423843

23835-
const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
23836-
if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
23844+
if (ExtractIdx->getAsAPIntVal().uge(
23845+
ExtractedFromVec.getValueType().getVectorNumElements()))
2383723846
return SDValue();
2383823847

2383923848
// All inputs must have the same element type as the output.
2384023849
if (VT.getVectorElementType() !=
2384123850
ExtractedFromVec.getValueType().getVectorElementType())
2384223851
return SDValue();
2384323852

23853+
OneConstExtractIndex = ExtractIdx->getZExtValue();
23854+
++NumExtracts;
23855+
2384423856
// Have we seen this input vector before?
2384523857
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
2384623858
// a map back from SDValues to numbers isn't worth it.
@@ -23863,6 +23875,20 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
2386323875
// VecIn accordingly.
2386423876
bool DidSplitVec = false;
2386523877
if (VecIn.size() == 2) {
23878+
// If we only found a single constant indexed extract_vector_elt feeding the
23879+
// build_vector, do not produce a more complicated shuffle if the extract is
23880+
// cheap with other constant/undef elements. Skip broadcast patterns with
23881+
// multiple uses in the build_vector.
23882+
23883+
// TODO: This should be more aggressive about skipping the shuffle
23884+
// formation, particularly if VecIn[1].hasOneUse(), and regardless of the
23885+
// index.
23886+
if (NumExtracts == 1 &&
23887+
TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
23888+
TLI.isTypeLegal(VT.getVectorElementType()) &&
23889+
TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
23890+
return SDValue();
23891+
2386623892
unsigned MaxIndex = 0;
2386723893
unsigned NearestPow2 = 0;
2386823894
SDValue Vec = VecIn.back();

Diff for: llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll

+5-5
Original file line numberDiff line numberDiff line change
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
452452
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
453453
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
454454
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
455-
; GCN-NEXT: v_mov_b32_e32 v3, s1
456-
; GCN-NEXT: v_mov_b32_e32 v0, s2
457-
; GCN-NEXT: v_mov_b32_e32 v1, s3
458-
; GCN-NEXT: v_mov_b32_e32 v2, s0
459-
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
455+
; GCN-NEXT: v_mov_b32_e32 v0, s0
456+
; GCN-NEXT: v_mov_b32_e32 v2, s2
457+
; GCN-NEXT: v_mov_b32_e32 v1, s1
458+
; GCN-NEXT: v_mov_b32_e32 v3, s3
459+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
460460
; GCN-NEXT: s_endpgm
461461
entry:
462462
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel

0 commit comments

Comments
 (0)