diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 63423463eeee2..fa704ddd577df 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1069,7 +1069,8 @@ class SelectionDAG { SDValue EVL); /// Returns sum of the base pointer and offset. - /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. + /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by + /// default. SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); SDValue getMemBasePlusOffset(SDValue Base, SDValue Offset, const SDLoc &DL, @@ -1077,15 +1078,18 @@ class SelectionDAG { /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple - /// components, create an add nuw from the base pointer to the offset. + /// components, create an add nuw inbounds from the base pointer to the + /// offset. SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } /// Return a new CALLSEQ_START node, that starts new call frame, in which diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b322fe670d4a7..3b63971d21b8a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1213,9 +1213,12 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) { SDNodeFlags NewFlags; - if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && - Flags.hasNoUnsignedWrap()) - NewFlags |= SDNodeFlags::NoUnsignedWrap; + if (N0.getOpcode() == ISD::ADD) { + if (N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap()) + NewFlags |= SDNodeFlags::NoUnsignedWrap; + if (N0->getFlags().hasInBounds() && Flags.hasInBounds()) + NewFlags |= SDNodeFlags::InBounds; + } if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d6dcb3f15ae7c..53f7aad5d6998 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8181,7 +8181,7 @@ static SDValue getMemcpyLoadsAndStores( if (Value.getNode()) { Store = DAG.getStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo); OutChains.push_back(Store); } @@ -8206,14 +8206,14 @@ static SDValue getMemcpyLoadsAndStores( Value = DAG.getExtLoad( ISD::EXTLOAD, dl, NVT, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl), + DAG.getObjectPtrOffset(dl, 
Src, TypeSize::getFixed(SrcOff)), SrcPtrInfo.getWithOffset(SrcOff), VT, commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo); OutLoadChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo); OutStoreChains.push_back(Store); } @@ -8350,7 +8350,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Value = DAG.getLoad( VT, dl, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl), + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)), SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); @@ -8365,7 +8365,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Store = DAG.getStore( Chain, dl, LoadValues[i], - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo); OutChains.push_back(Store); DstOff += VTSize; @@ -8497,7 +8497,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, assert(Value.getValueType() == VT && "Value with wrong type."); SDValue Store = DAG.getStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone, NewAAInfo); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 536bf0c208752..62c009d06a4de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1744,72 +1744,82 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. 
- - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. + + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? 
+ SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 0959687d3834c..aeeafbdb77eca 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -16,8 +16,8 @@ ; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) -; FIXME the offset here should not be folded: if %p points to the beginning of -; scratch or LDS and %i is -1, a folded offset crashes the program. +; The offset here cannot be folded: if %p points to the beginning of scratch or +; LDS and %i is -1, a folded offset crashes the program. define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX90A-LABEL: flat_offset_maybe_oob: ; GFX90A: ; %bb.0: @@ -26,7 +26,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -37,7 +39,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -46,7 +50,8 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12 +; GFX942-NEXT: flat_load_dword v0, v[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -57,9 +62,12 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -76,7 +84,10 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %idx = add nsw i32 %i, 3 @@ -156,3 +167,350 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { %l = load i32, ptr addrspace(5) %arrayidx ret i32 %l } + +; If the GEP that adds the offset is inbounds, folding the offset is legal. +define i32 @flat_offset_inbounds(ptr %p, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: 
v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load i32, ptr %arrayidx + ret i32 %l +} + +define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds_wide: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX90A-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds_wide: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX10-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds_wide: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX942-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds_wide: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: flat_load_b32 v8, v[0:1] offset:28 +; GFX11-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds_wide: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b32 v8, v[0:1] offset:28 +; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <5 x i32>, ptr %arrayidx + store <5 x i32> %l, ptr %pout + ret void +} + +define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds_very_wide: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108 +; GFX90A-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124 +; GFX90A-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76 +; GFX90A-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92 +; GFX90A-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44 +; GFX90A-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60 +; GFX90A-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX90A-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28 +; GFX90A-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX90A-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds_very_wide: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108 +; GFX10-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124 +; GFX10-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76 +; GFX10-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92 +; GFX10-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44 +; GFX10-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60 +; GFX10-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX10-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28 +; GFX10-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140 +; GFX10-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96 +; GFX10-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112 +; GFX10-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) 
+; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX10-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80 +; GFX10-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds_very_wide: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108 +; GFX942-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124 +; GFX942-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76 +; GFX942-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92 +; GFX942-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44 +; GFX942-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60 +; GFX942-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX942-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28 +; GFX942-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX942-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds_very_wide: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: flat_load_b128 v[4:7], v[0:1] offset:108 +; GFX11-NEXT: flat_load_b128 v[8:11], v[0:1] offset:124 +; GFX11-NEXT: flat_load_b128 v[12:15], v[0:1] offset:76 +; GFX11-NEXT: flat_load_b128 v[16:19], v[0:1] offset:92 +; GFX11-NEXT: flat_load_b128 v[20:23], v[0:1] offset:44 +; GFX11-NEXT: flat_load_b128 v[24:27], v[0:1] offset:60 +; GFX11-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX11-NEXT: flat_load_b128 v[32:35], v[0:1] offset:28 +; GFX11-NEXT: flat_load_b128 v[36:39], v[0:1] offset:140 +; GFX11-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[4:7] offset:96 +; GFX11-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[8:11] offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[12:15] offset:64 +; GFX11-NEXT: s_waitcnt vmcnt(5) 
lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[16:19] offset:80 +; GFX11-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[24:27] offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[32:35] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds_very_wide: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-NEXT: s_clause 0x8 +; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] offset:108 +; GFX12-NEXT: flat_load_b128 v[8:11], v[0:1] offset:124 +; GFX12-NEXT: flat_load_b128 v[12:15], v[0:1] offset:76 +; GFX12-NEXT: flat_load_b128 v[16:19], v[0:1] offset:92 +; GFX12-NEXT: flat_load_b128 v[20:23], v[0:1] offset:44 +; GFX12-NEXT: flat_load_b128 v[24:27], v[0:1] offset:60 +; GFX12-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX12-NEXT: flat_load_b128 v[32:35], v[0:1] offset:28 +; GFX12-NEXT: flat_load_b128 v[36:39], v[0:1] offset:140 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x808 +; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7] offset:96 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x708 +; GFX12-NEXT: flat_store_b128 v[2:3], v[8:11] offset:112 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x608 +; GFX12-NEXT: flat_store_b128 v[2:3], v[12:15] offset:64 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x508 +; GFX12-NEXT: flat_store_b128 v[2:3], v[16:19] offset:80 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x408 +; GFX12-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x308 +; GFX12-NEXT: flat_store_b128 v[2:3], v[24:27] offset:48 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x208 +; GFX12-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x108 +; GFX12-NEXT: flat_store_b128 v[2:3], v[32:35] offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <35 x i32>, ptr %arrayidx + store <35 x i32> %l, ptr %pout + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 874dece6b728d..0c55c91ba8dbd 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -11,18 +11,22 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-NEXT: s_mov_b32 s5, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-NEXT: 
.LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 @@ -37,17 +41,20 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 0f1c1cf0d80af..0fafb1dc42a6e 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -16031,6 +16031,241 @@ entry: ret void } +define void @memset_p0_sz19(ptr addrspace(0) %dst) { +; CHECK-LABEL: memset_p0_sz19: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x41414141 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x41 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0x4141 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v5, s5 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 +; CHECK-NEXT: flat_store_short v[0:1], v7 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p0_sz19: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v2, 0x41 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:17 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:16 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], 
v2 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:3 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p0_sz19: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: v_mov_b32_e32 v6, 0x41 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v4, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v7, 0x4141 +; UNROLL3-NEXT: v_mov_b32_e32 v2, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v5, s5 +; UNROLL3-NEXT: v_mov_b32_e32 v3, s5 +; UNROLL3-NEXT: flat_store_byte v[0:1], v6 offset:18 +; UNROLL3-NEXT: flat_store_short v[0:1], v7 offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull %dst, i8 65, i64 19, i1 false) + ret void +} + +define void @memset_p1_sz19(ptr addrspace(1) %dst) { +; CHECK-LABEL: memset_p1_sz19: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, 0x41414141 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v2 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:15 +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p1_sz19: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v2, 0x41 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:18 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:17 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:16 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:15 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:14 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:13 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:12 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:11 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:10 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:9 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:8 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:7 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:6 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:5 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:4 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:3 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:2 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:1 +; ALIGNED-NEXT: global_store_byte v[0:1], v2, off +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p1_sz19: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; UNROLL3-NEXT: v_mov_b32_e32 v2, 0x41414141 +; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v2 +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; UNROLL3-NEXT: global_store_dword v[0:1], v2, off offset:15 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull %dst, i8 65, i64 19, i1 false) + ret void +} + +define void @memset_p3_sz19(ptr addrspace(3) %dst) { +; CHECK-LABEL: memset_p3_sz19: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x41414141 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x41 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b32_e32 v4, 0x4141 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-NEXT: ds_write_b8 v0, v3 offset:18 +; CHECK-NEXT: ds_write_b16 v0, v4 offset:16 +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p3_sz19: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:18 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:17 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:16 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:15 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:14 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:13 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:12 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:11 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:10 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:9 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:8 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:7 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:6 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:5 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:4 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:3 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:2 +; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:1 +; ALIGNED-NEXT: ds_write_b8 v0, v1 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p3_sz19: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141 +; UNROLL3-NEXT: v_mov_b32_e32 v3, 0x41 +; UNROLL3-NEXT: s_mov_b32 s5, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v4, 0x4141 +; UNROLL3-NEXT: v_mov_b32_e32 v1, s4 +; UNROLL3-NEXT: v_mov_b32_e32 v2, s5 +; UNROLL3-NEXT: ds_write_b8 v0, v3 offset:18 +; UNROLL3-NEXT: ds_write_b16 v0, v4 offset:16 +; UNROLL3-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:1 +; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull %dst, i8 65, i64 19, i1 false) + ret void +} + +define void @memset_p5_sz19(ptr addrspace(5) %dst) { +; CHECK-LABEL: memset_p5_sz19: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41414141 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x41 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x4141 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen 
offset:16 +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p5_sz19: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; ALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; UNROLL3-LABEL: memset_p5_sz19: +; UNROLL3: ; %bb.0: ; %entry +; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141 +; UNROLL3-NEXT: v_mov_b32_e32 v2, 0x41 +; UNROLL3-NEXT: v_mov_b32_e32 v3, 0x4141 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; UNROLL3-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull %dst, i8 65, i64 19, i1 false) + ret void +} declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2 declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2 @@ -16046,4 +16281,10 @@ declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr a declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2 +declare void @llvm.memset.p0.i64(ptr addrspace(0) nocapture writeonly, i8, i64, i1 immarg) #3 +declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64, i1 immarg) #3 +declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly, i8, i64, i1 immarg) #3 +declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) #3 + attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll 
b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 39af91b81110d..60aac9ad56f06 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -13,9 +13,9 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s2, s0 ; CHECK-NEXT: s_addc_u32 s1, s3, s1 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol
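
Illustration (not part of the patch): the IR shape whose offset folding this change gates, copied from the flat_offset_inbounds test added above. Because both GEPs carry the inbounds flag, the reassociated constant offset (+12 bytes) provably stays within the same underlying object and therefore within the same memory aperture, so SelectFlatOffsetImpl may move it into the FLAT instruction's immediate offset field. In the flat_offset_maybe_oob variant, where the offset does not come from inbounds GEPs, the add is now kept as a separate 64-bit add instead.

; Reduced example, taken from the flat_offset_inbounds test in this patch.
; With inbounds on both GEPs the +12 byte offset may be folded into the load,
; e.g. "flat_load_dword v0, v[0:1] offset:12"; without inbounds the offset
; stays a separate v_add_co_u32/v_addc_co_u32 pair.
define i32 @flat_offset_inbounds(ptr %p, i32 %i) {
  %p.1 = getelementptr inbounds i32, ptr %p, i32 %i
  %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3
  %l = load i32, ptr %arrayidx
  ret i32 %l
}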